diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-09-12 15:44:27 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-09-12 15:44:27 -0700 |
commit | ac4de9543aca59f2b763746647577302fbedd57e (patch) | |
tree | 40407750569ee030de56233c41c9a97f7e89cf67 | |
parent | 26935fb06ee88f1188789807687c03041f3c70d9 (diff) | |
parent | de32a8177f64bc62e1b19c685dd391af664ab13f (diff) | |
download | lwn-ac4de9543aca59f2b763746647577302fbedd57e.tar.gz lwn-ac4de9543aca59f2b763746647577302fbedd57e.zip |
Merge branch 'akpm' (patches from Andrew Morton)
Merge more patches from Andrew Morton:
"The rest of MM. Plus one misc cleanup"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (35 commits)
mm/Kconfig: add MMU dependency for MIGRATION.
kernel: replace strict_strto*() with kstrto*()
mm, thp: count thp_fault_fallback anytime thp fault fails
thp: consolidate code between handle_mm_fault() and do_huge_pmd_anonymous_page()
thp: do_huge_pmd_anonymous_page() cleanup
thp: move maybe_pmd_mkwrite() out of mk_huge_pmd()
mm: cleanup add_to_page_cache_locked()
thp: account anon transparent huge pages into NR_ANON_PAGES
truncate: drop 'oldsize' truncate_pagecache() parameter
mm: make lru_add_drain_all() selective
memcg: document cgroup dirty/writeback memory statistics
memcg: add per cgroup writeback pages accounting
memcg: check for proper lock held in mem_cgroup_update_page_stat
memcg: remove MEMCG_NR_FILE_MAPPED
memcg: reduce function dereference
memcg: avoid overflow caused by PAGE_ALIGN
memcg: rename RESOURCE_MAX to RES_COUNTER_MAX
memcg: correct RESOURCE_MAX to ULLONG_MAX
mm: memcg: do not trap chargers with full callstack on OOM
mm: memcg: rework and document OOM waiting and wakeup
...
79 files changed, 973 insertions, 919 deletions
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 2a3330696372..8af4ad121828 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -490,6 +490,8 @@ pgpgin - # of charging events to the memory cgroup. The charging pgpgout - # of uncharging events to the memory cgroup. The uncharging event happens each time a page is unaccounted from the cgroup. swap - # of bytes of swap usage +writeback - # of bytes of file/anon cache that are queued for syncing to + disk. inactive_anon - # of bytes of anonymous and swap cache memory on inactive LRU list. active_anon - # of bytes of anonymous and swap cache memory on active diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index 0c4132dd3507..98838a05ba6d 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -89,8 +89,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr, const struct exception_table_entry *fixup; int fault, si_code = SEGV_MAPERR; siginfo_t info; - unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - (cause > 0 ? FAULT_FLAG_WRITE : 0)); + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; /* As of EV6, a load into $31/$f31 is a prefetch, and never faults (or is suppressed by the PALcode). Support that for older CPUs @@ -115,7 +114,8 @@ do_page_fault(unsigned long address, unsigned long mmcsr, if (address >= TASK_SIZE) goto vmalloc_fault; #endif - + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; retry: down_read(&mm->mmap_sem); vma = find_vma(mm, address); @@ -142,6 +142,7 @@ retry: } else { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; + flags |= FAULT_FLAG_WRITE; } /* If for any reason at all we couldn't handle the fault, diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index 0fd1f0d515ff..d63f3de0cd5b 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c @@ -60,8 +60,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address) siginfo_t info; int fault, ret; int write = regs->ecr_cause & ECR_C_PROTV_STORE; /* ST/EX */ - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - (write ? FAULT_FLAG_WRITE : 0); + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; /* * We fault-in kernel-space virtual memory on-demand. The @@ -89,6 +88,8 @@ void do_page_fault(struct pt_regs *regs, unsigned long address) if (in_atomic() || !mm) goto no_context; + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; retry: down_read(&mm->mmap_sem); vma = find_vma(mm, address); @@ -117,12 +118,12 @@ good_area: if (write) { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; + flags |= FAULT_FLAG_WRITE; } else { if (!(vma->vm_flags & (VM_READ | VM_EXEC))) goto bad_area; } -survive: /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo @@ -201,10 +202,6 @@ no_context: die("Oops", regs, address); out_of_memory: - if (is_global_init(tsk)) { - yield(); - goto survive; - } up_read(&mm->mmap_sem); if (user_mode(regs)) { diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index c97f7940cb95..eb8830a4c5ed 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -261,9 +261,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) struct task_struct *tsk; struct mm_struct *mm; int fault, sig, code; - int write = fsr & FSR_WRITE; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - (write ? FAULT_FLAG_WRITE : 0); + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; if (notify_page_fault(regs, fsr)) return 0; @@ -282,6 +280,11 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) if (in_atomic() || !mm) goto no_context; + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + if (fsr & FSR_WRITE) + flags |= FAULT_FLAG_WRITE; + /* * As per x86, we may deadlock here. However, since the kernel only * validly references user space from well defined areas of the code, @@ -349,6 +352,13 @@ retry: if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS)))) return 0; + /* + * If we are in kernel mode at this point, we + * have no context to handle this fault with. + */ + if (!user_mode(regs)) + goto no_context; + if (fault & VM_FAULT_OOM) { /* * We ran out of memory, call the OOM killer, and return to @@ -359,13 +369,6 @@ retry: return 0; } - /* - * If we are in kernel mode at this point, we - * have no context to handle this fault with. - */ - if (!user_mode(regs)) - goto no_context; - if (fault & VM_FAULT_SIGBUS) { /* * We had some memory, but were unable to diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 6c8ba25bf6bb..6d6acf153bff 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -199,13 +199,6 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, unsigned long vm_flags = VM_READ | VM_WRITE | VM_EXEC; unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; - if (esr & ESR_LNX_EXEC) { - vm_flags = VM_EXEC; - } else if ((esr & ESR_WRITE) && !(esr & ESR_CM)) { - vm_flags = VM_WRITE; - mm_flags |= FAULT_FLAG_WRITE; - } - tsk = current; mm = tsk->mm; @@ -220,6 +213,16 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, if (in_atomic() || !mm) goto no_context; + if (user_mode(regs)) + mm_flags |= FAULT_FLAG_USER; + + if (esr & ESR_LNX_EXEC) { + vm_flags = VM_EXEC; + } else if ((esr & ESR_WRITE) && !(esr & ESR_CM)) { + vm_flags = VM_WRITE; + mm_flags |= FAULT_FLAG_WRITE; + } + /* * As per x86, we may deadlock here. However, since the kernel only * validly references user space from well defined areas of the code, @@ -288,6 +291,13 @@ retry: VM_FAULT_BADACCESS)))) return 0; + /* + * If we are in kernel mode at this point, we have no context to + * handle this fault with. + */ + if (!user_mode(regs)) + goto no_context; + if (fault & VM_FAULT_OOM) { /* * We ran out of memory, call the OOM killer, and return to @@ -298,13 +308,6 @@ retry: return 0; } - /* - * If we are in kernel mode at this point, we have no context to - * handle this fault with. - */ - if (!user_mode(regs)) - goto no_context; - if (fault & VM_FAULT_SIGBUS) { /* * We had some memory, but were unable to successfully fix up diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c index b2f2d2d66849..0eca93327195 100644 --- a/arch/avr32/mm/fault.c +++ b/arch/avr32/mm/fault.c @@ -86,6 +86,8 @@ asmlinkage void do_page_fault(unsigned long ecr, struct pt_regs *regs) local_irq_enable(); + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; retry: down_read(&mm->mmap_sem); @@ -228,9 +230,9 @@ no_context: */ out_of_memory: up_read(&mm->mmap_sem); - pagefault_out_of_memory(); if (!user_mode(regs)) goto no_context; + pagefault_out_of_memory(); return; do_sigbus: diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c index 73312ab6c696..1790f22e71a2 100644 --- a/arch/cris/mm/fault.c +++ b/arch/cris/mm/fault.c @@ -58,8 +58,7 @@ do_page_fault(unsigned long address, struct pt_regs *regs, struct vm_area_struct * vma; siginfo_t info; int fault; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - ((writeaccess & 1) ? FAULT_FLAG_WRITE : 0); + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; D(printk(KERN_DEBUG "Page fault for %lX on %X at %lX, prot %d write %d\n", @@ -117,6 +116,8 @@ do_page_fault(unsigned long address, struct pt_regs *regs, if (in_atomic() || !mm) goto no_context; + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; retry: down_read(&mm->mmap_sem); vma = find_vma(mm, address); @@ -155,6 +156,7 @@ retry: } else if (writeaccess == 1) { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; + flags |= FAULT_FLAG_WRITE; } else { if (!(vma->vm_flags & (VM_READ | VM_EXEC))) goto bad_area; diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c index 331c1e2cfb67..9a66372fc7c7 100644 --- a/arch/frv/mm/fault.c +++ b/arch/frv/mm/fault.c @@ -34,11 +34,11 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear struct vm_area_struct *vma; struct mm_struct *mm; unsigned long _pme, lrai, lrad, fixup; + unsigned long flags = 0; siginfo_t info; pgd_t *pge; pud_t *pue; pte_t *pte; - int write; int fault; #if 0 @@ -81,6 +81,9 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear if (in_atomic() || !mm) goto no_context; + if (user_mode(__frame)) + flags |= FAULT_FLAG_USER; + down_read(&mm->mmap_sem); vma = find_vma(mm, ear0); @@ -129,7 +132,6 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear */ good_area: info.si_code = SEGV_ACCERR; - write = 0; switch (esr0 & ESR0_ATXC) { default: /* handle write to write protected page */ @@ -140,7 +142,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear #endif if (!(vma->vm_flags & VM_WRITE)) goto bad_area; - write = 1; + flags |= FAULT_FLAG_WRITE; break; /* handle read from protected page */ @@ -162,7 +164,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, ear0, write ? FAULT_FLAG_WRITE : 0); + fault = handle_mm_fault(mm, vma, ear0, flags); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c index 1bd276dbec7d..8704c9320032 100644 --- a/arch/hexagon/mm/vm_fault.c +++ b/arch/hexagon/mm/vm_fault.c @@ -53,8 +53,7 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs) int si_code = SEGV_MAPERR; int fault; const struct exception_table_entry *fixup; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - (cause > 0 ? FAULT_FLAG_WRITE : 0); + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; /* * If we're in an interrupt or have no user context, @@ -65,6 +64,8 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs) local_irq_enable(); + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; retry: down_read(&mm->mmap_sem); vma = find_vma(mm, address); @@ -96,6 +97,7 @@ good_area: case FLT_STORE: if (!(vma->vm_flags & VM_WRITE)) goto bad_area; + flags |= FAULT_FLAG_WRITE; break; } diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 6cf0341f978e..7225dad87094 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -90,8 +90,6 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re mask = ((((isr >> IA64_ISR_X_BIT) & 1UL) << VM_EXEC_BIT) | (((isr >> IA64_ISR_W_BIT) & 1UL) << VM_WRITE_BIT)); - flags |= ((mask & VM_WRITE) ? FAULT_FLAG_WRITE : 0); - /* mmap_sem is performance critical.... */ prefetchw(&mm->mmap_sem); @@ -119,6 +117,10 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re if (notify_page_fault(regs, TRAP_BRKPT)) return; + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + if (mask & VM_WRITE) + flags |= FAULT_FLAG_WRITE; retry: down_read(&mm->mmap_sem); diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c index 3cdfa9c1d091..e9c6a8014bd6 100644 --- a/arch/m32r/mm/fault.c +++ b/arch/m32r/mm/fault.c @@ -78,7 +78,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, struct mm_struct *mm; struct vm_area_struct * vma; unsigned long page, addr; - int write; + unsigned long flags = 0; int fault; siginfo_t info; @@ -117,6 +117,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, if (in_atomic() || !mm) goto bad_area_nosemaphore; + if (error_code & ACE_USERMODE) + flags |= FAULT_FLAG_USER; + /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the * kernel and should generate an OOPS. Unfortunately, in the case of an @@ -166,14 +169,13 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, */ good_area: info.si_code = SEGV_ACCERR; - write = 0; switch (error_code & (ACE_WRITE|ACE_PROTECTION)) { default: /* 3: write, present */ /* fall through */ case ACE_WRITE: /* write, not present */ if (!(vma->vm_flags & VM_WRITE)) goto bad_area; - write++; + flags |= FAULT_FLAG_WRITE; break; case ACE_PROTECTION: /* read, present */ case 0: /* read, not present */ @@ -194,7 +196,7 @@ good_area: */ addr = (address & PAGE_MASK); set_thread_fault_code(error_code); - fault = handle_mm_fault(mm, vma, addr, write ? FAULT_FLAG_WRITE : 0); + fault = handle_mm_fault(mm, vma, addr, flags); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index a563727806bf..eb1d61f68725 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -88,6 +88,8 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, if (in_atomic() || !mm) goto no_context; + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; retry: down_read(&mm->mmap_sem); diff --git a/arch/metag/mm/fault.c b/arch/metag/mm/fault.c index 8fddf46e6c62..332680e5ebf2 100644 --- a/arch/metag/mm/fault.c +++ b/arch/metag/mm/fault.c @@ -53,8 +53,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, struct vm_area_struct *vma, *prev_vma; siginfo_t info; int fault; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - (write_access ? FAULT_FLAG_WRITE : 0); + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; tsk = current; @@ -109,6 +108,8 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, if (in_atomic() || !mm) goto no_context; + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; retry: down_read(&mm->mmap_sem); @@ -121,6 +122,7 @@ good_area: if (write_access) { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; + flags |= FAULT_FLAG_WRITE; } else { if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) goto bad_area; diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index 731f739d17a1..fa4cf52aa7a6 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c @@ -92,8 +92,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address, int code = SEGV_MAPERR; int is_write = error_code & ESR_S; int fault; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - (is_write ? FAULT_FLAG_WRITE : 0); + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; regs->ear = address; regs->esr = error_code; @@ -121,6 +120,9 @@ void do_page_fault(struct pt_regs *regs, unsigned long address, die("Weird page fault", regs, SIGSEGV); } + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the * kernel and should generate an OOPS. Unfortunately, in the case of an @@ -199,6 +201,7 @@ good_area: if (unlikely(is_write)) { if (unlikely(!(vma->vm_flags & VM_WRITE))) goto bad_area; + flags |= FAULT_FLAG_WRITE; /* a read */ } else { /* protection fault */ diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index 85df1cd8d446..becc42bb1849 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -42,8 +42,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write, const int field = sizeof(unsigned long) * 2; siginfo_t info; int fault; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - (write ? FAULT_FLAG_WRITE : 0); + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; #if 0 printk("Cpu%d[%s:%d:%0*lx:%ld:%0*lx]\n", raw_smp_processor_id(), @@ -93,6 +92,8 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write, if (in_atomic() || !mm) goto bad_area_nosemaphore; + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; retry: down_read(&mm->mmap_sem); vma = find_vma(mm, address); @@ -114,6 +115,7 @@ good_area: if (write) { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; + flags |= FAULT_FLAG_WRITE; } else { if (cpu_has_rixi) { if (address == regs->cp0_epc && !(vma->vm_flags & VM_EXEC)) { @@ -241,6 +243,8 @@ out_of_memory: * (which will retry the fault, or kill us if we got oom-killed). */ up_read(&mm->mmap_sem); + if (!user_mode(regs)) + goto no_context; pagefault_out_of_memory(); return; diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c index 8a2e6ded9a44..3516cbdf1ee9 100644 --- a/arch/mn10300/mm/fault.c +++ b/arch/mn10300/mm/fault.c @@ -171,6 +171,8 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long fault_code, if (in_atomic() || !mm) goto no_context; + if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR) + flags |= FAULT_FLAG_USER; retry: down_read(&mm->mmap_sem); diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index 4a41f8493ab0..0703acf7d327 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c @@ -86,6 +86,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long address, if (user_mode(regs)) { /* Exception was in userspace: reenable interrupts */ local_irq_enable(); + flags |= FAULT_FLAG_USER; } else { /* If exception was in a syscall, then IRQ's may have * been enabled or disabled. If they were enabled, diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index f247a3480e8e..d10d27a720c0 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -180,6 +180,10 @@ void do_page_fault(struct pt_regs *regs, unsigned long code, if (in_atomic() || !mm) goto no_context; + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + if (acc_type & VM_WRITE) + flags |= FAULT_FLAG_WRITE; retry: down_read(&mm->mmap_sem); vma = find_vma_prev(mm, address, &prev_vma); @@ -203,8 +207,7 @@ good_area: * fault. */ - fault = handle_mm_fault(mm, vma, address, - flags | ((acc_type & VM_WRITE) ? FAULT_FLAG_WRITE : 0)); + fault = handle_mm_fault(mm, vma, address, flags); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) return; diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 2dd69bf4af46..51ab9e7e6c39 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -223,9 +223,6 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, is_write = error_code & ESR_DST; #endif /* CONFIG_4xx || CONFIG_BOOKE */ - if (is_write) - flags |= FAULT_FLAG_WRITE; - #ifdef CONFIG_PPC_ICSWX /* * we need to do this early because this "data storage @@ -288,6 +285,9 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, if (user_mode(regs)) store_update_sp = store_updates_sp(regs); + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the * kernel and should generate an OOPS. Unfortunately, in the case of an @@ -415,6 +415,7 @@ good_area: } else if (is_write) { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; + flags |= FAULT_FLAG_WRITE; /* a read */ } else { /* protection fault */ diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 7de4469915f0..fc6679210d83 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -302,6 +302,8 @@ static inline int do_exception(struct pt_regs *regs, int access) address = trans_exc_code & __FAIL_ADDR_MASK; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400) flags |= FAULT_FLAG_WRITE; down_read(&mm->mmap_sem); diff --git a/arch/score/mm/fault.c b/arch/score/mm/fault.c index 6b18fb0189ae..52238983527d 100644 --- a/arch/score/mm/fault.c +++ b/arch/score/mm/fault.c @@ -47,6 +47,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write, struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; const int field = sizeof(unsigned long) * 2; + unsigned long flags = 0; siginfo_t info; int fault; @@ -75,6 +76,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write, if (in_atomic() || !mm) goto bad_area_nosemaphore; + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + down_read(&mm->mmap_sem); vma = find_vma(mm, address); if (!vma) @@ -95,18 +99,18 @@ good_area: if (write) { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; + flags |= FAULT_FLAG_WRITE; } else { if (!(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))) goto bad_area; } -survive: /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, address, write); + fault = handle_mm_fault(mm, vma, address, flags); if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; @@ -167,11 +171,6 @@ no_context: */ out_of_memory: up_read(&mm->mmap_sem); - if (is_global_init(tsk)) { - yield(); - down_read(&mm->mmap_sem); - goto survive; - } if (!user_mode(regs)) goto no_context; pagefault_out_of_memory(); diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 1f49c28affa9..541dc6101508 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -400,9 +400,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, struct mm_struct *mm; struct vm_area_struct * vma; int fault; - int write = error_code & FAULT_CODE_WRITE; - unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - (write ? FAULT_FLAG_WRITE : 0)); + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; tsk = current; mm = tsk->mm; @@ -476,6 +474,11 @@ good_area: set_thread_fault_code(error_code); + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + if (error_code & FAULT_CODE_WRITE) + flags |= FAULT_FLAG_WRITE; + /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index e98bfda205a2..59dbd4645725 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -177,8 +177,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write, unsigned long g2; int from_user = !(regs->psr & PSR_PS); int fault, code; - unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - (write ? FAULT_FLAG_WRITE : 0)); + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; if (text_fault) address = regs->pc; @@ -235,6 +234,11 @@ good_area: goto bad_area; } + if (from_user) + flags |= FAULT_FLAG_USER; + if (write) + flags |= FAULT_FLAG_WRITE; + /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo @@ -383,6 +387,7 @@ static void force_user_fault(unsigned long address, int write) struct vm_area_struct *vma; struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; + unsigned int flags = FAULT_FLAG_USER; int code; code = SEGV_MAPERR; @@ -402,11 +407,12 @@ good_area: if (write) { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; + flags |= FAULT_FLAG_WRITE; } else { if (!(vma->vm_flags & (VM_READ | VM_EXEC))) goto bad_area; } - switch (handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0)) { + switch (handle_mm_fault(mm, vma, address, flags)) { case VM_FAULT_SIGBUS: case VM_FAULT_OOM: goto do_sigbus; diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 5062ff389e83..2ebec263d685 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -315,7 +315,8 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs) bad_kernel_pc(regs, address); return; } - } + } else + flags |= FAULT_FLAG_USER; /* * If we're in an interrupt or have no user @@ -418,13 +419,14 @@ good_area: vma->vm_file != NULL) set_thread_fault_code(fault_code | FAULT_CODE_BLKCOMMIT); + + flags |= FAULT_FLAG_WRITE; } else { /* Allow reads even for write-only mappings */ if (!(vma->vm_flags & (VM_READ | VM_EXEC))) goto bad_area; } - flags |= ((fault_code & FAULT_CODE_WRITE) ? FAULT_FLAG_WRITE : 0); fault = handle_mm_fault(mm, vma, address, flags); if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c index 111d5a9b76f1..4c288f199453 100644 --- a/arch/tile/mm/fault.c +++ b/arch/tile/mm/fault.c @@ -280,8 +280,7 @@ static int handle_page_fault(struct pt_regs *regs, if (!is_page_fault) write = 1; - flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - (write ? FAULT_FLAG_WRITE : 0)); + flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; is_kernel_mode = !user_mode(regs); @@ -365,6 +364,9 @@ static int handle_page_fault(struct pt_regs *regs, goto bad_area_nosemaphore; } + if (!is_kernel_mode) + flags |= FAULT_FLAG_USER; + /* * When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the @@ -425,12 +427,12 @@ good_area: #endif if (!(vma->vm_flags & VM_WRITE)) goto bad_area; + flags |= FAULT_FLAG_WRITE; } else { if (!is_page_fault || !(vma->vm_flags & VM_READ)) goto bad_area; } - survive: /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo @@ -555,11 +557,6 @@ no_context: */ out_of_memory: up_read(&mm->mmap_sem); - if (is_global_init(tsk)) { - yield(); - down_read(&mm->mmap_sem); - goto survive; - } if (is_kernel_mode) goto no_context; pagefault_out_of_memory(); diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 089f3987e273..5c3aef74237f 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -30,8 +30,7 @@ int handle_page_fault(unsigned long address, unsigned long ip, pmd_t *pmd; pte_t *pte; int err = -EFAULT; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - (is_write ? FAULT_FLAG_WRITE : 0); + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; *code_out = SEGV_MAPERR; @@ -42,6 +41,8 @@ int handle_page_fault(unsigned long address, unsigned long ip, if (in_atomic()) goto out_nosemaphore; + if (is_user) + flags |= FAULT_FLAG_USER; retry: down_read(&mm->mmap_sem); vma = find_vma(mm, address); @@ -58,12 +59,15 @@ retry: good_area: *code_out = SEGV_ACCERR; - if (is_write && !(vma->vm_flags & VM_WRITE)) - goto out; - - /* Don't require VM_READ|VM_EXEC for write faults! */ - if (!is_write && !(vma->vm_flags & (VM_READ | VM_EXEC))) - goto out; + if (is_write) { + if (!(vma->vm_flags & VM_WRITE)) + goto out; + flags |= FAULT_FLAG_WRITE; + } else { + /* Don't require VM_READ|VM_EXEC for write faults! */ + if (!(vma->vm_flags & (VM_READ | VM_EXEC))) + goto out; + } do { int fault; @@ -124,6 +128,8 @@ out_of_memory: * (which will retry the fault, or kill us if we got oom-killed). */ up_read(&mm->mmap_sem); + if (!is_user) + goto out_nosemaphore; pagefault_out_of_memory(); return 0; } diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c index f9b5c10bccee..0dc922dba915 100644 --- a/arch/unicore32/mm/fault.c +++ b/arch/unicore32/mm/fault.c @@ -209,8 +209,7 @@ static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs) struct task_struct *tsk; struct mm_struct *mm; int fault, sig, code; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - ((!(fsr ^ 0x12)) ? FAULT_FLAG_WRITE : 0); + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; tsk = current; mm = tsk->mm; @@ -222,6 +221,11 @@ static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs) if (in_atomic() || !mm) goto no_context; + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + if (!(fsr ^ 0x12)) + flags |= FAULT_FLAG_WRITE; + /* * As per x86, we may deadlock here. However, since the kernel only * validly references user space from well defined areas of the code, @@ -278,6 +282,13 @@ retry: (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS)))) return 0; + /* + * If we are in kernel mode at this point, we + * have no context to handle this fault with. + */ + if (!user_mode(regs)) + goto no_context; + if (fault & VM_FAULT_OOM) { /* * We ran out of memory, call the OOM killer, and return to @@ -288,13 +299,6 @@ retry: return 0; } - /* - * If we are in kernel mode at this point, we - * have no context to handle this fault with. - */ - if (!user_mode(regs)) - goto no_context; - if (fault & VM_FAULT_SIGBUS) { /* * We had some memory, but were unable to diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 654be4ae3047..3aaeffcfd67a 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -842,23 +842,15 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, force_sig_info_fault(SIGBUS, code, address, tsk, fault); } -static noinline int +static noinline void mm_fault_error(struct pt_regs *regs, unsigned long error_code, unsigned long address, unsigned int fault) { - /* - * Pagefault was interrupted by SIGKILL. We have no reason to - * continue pagefault. - */ - if (fatal_signal_pending(current)) { - if (!(fault & VM_FAULT_RETRY)) - up_read(¤t->mm->mmap_sem); - if (!(error_code & PF_USER)) - no_context(regs, error_code, address, 0, 0); - return 1; + if (fatal_signal_pending(current) && !(error_code & PF_USER)) { + up_read(¤t->mm->mmap_sem); + no_context(regs, error_code, address, 0, 0); + return; } - if (!(fault & VM_FAULT_ERROR)) - return 0; if (fault & VM_FAULT_OOM) { /* Kernel mode? Handle exceptions or die: */ @@ -866,7 +858,7 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, up_read(¤t->mm->mmap_sem); no_context(regs, error_code, address, SIGSEGV, SEGV_MAPERR); - return 1; + return; } up_read(¤t->mm->mmap_sem); @@ -884,7 +876,6 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, else BUG(); } - return 1; } static int spurious_fault_check(unsigned long error_code, pte_t *pte) @@ -1011,9 +1002,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code) unsigned long address; struct mm_struct *mm; int fault; - int write = error_code & PF_WRITE; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - (write ? FAULT_FLAG_WRITE : 0); + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; tsk = current; mm = tsk->mm; @@ -1083,6 +1072,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code) if (user_mode_vm(regs)) { local_irq_enable(); error_code |= PF_USER; + flags |= FAULT_FLAG_USER; } else { if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); @@ -1109,6 +1099,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code) return; } + if (error_code & PF_WRITE) + flags |= FAULT_FLAG_WRITE; + /* * When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in @@ -1187,9 +1180,17 @@ good_area: */ fault = handle_mm_fault(mm, vma, address, flags); - if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) { - if (mm_fault_error(regs, error_code, address, fault)) - return; + /* + * If we need to retry but a fatal signal is pending, handle the + * signal first. We do not need to release the mmap_sem because it + * would already be released in __lock_page_or_retry in mm/filemap.c. + */ + if (unlikely((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))) + return; + + if (unlikely(fault & VM_FAULT_ERROR)) { + mm_fault_error(regs, error_code, address, fault); + return; } /* diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index 4b7bc8db170f..70fa7bc42b4a 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -72,6 +72,8 @@ void do_page_fault(struct pt_regs *regs) address, exccause, regs->pc, is_write? "w":"", is_exec? "x":""); #endif + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; retry: down_read(&mm->mmap_sem); vma = find_vma(mm, address); diff --git a/drivers/base/node.c b/drivers/base/node.c index 7616a77ca322..bc9f43bf7e29 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -125,13 +125,7 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(node_page_state(nid, NR_WRITEBACK)), nid, K(node_page_state(nid, NR_FILE_PAGES)), nid, K(node_page_state(nid, NR_FILE_MAPPED)), -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - nid, K(node_page_state(nid, NR_ANON_PAGES) - + node_page_state(nid, NR_ANON_TRANSPARENT_HUGEPAGES) * - HPAGE_PMD_NR), -#else nid, K(node_page_state(nid, NR_ANON_PAGES)), -#endif nid, K(node_page_state(nid, NR_SHMEM)), nid, node_page_state(nid, NR_KERNEL_STACK) * THREAD_SIZE / 1024, diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 5f95d1ed9c6d..b9acadafa4a1 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -50,7 +50,7 @@ static void adfs_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); } static int adfs_write_begin(struct file *file, struct address_space *mapping, diff --git a/fs/affs/file.c b/fs/affs/file.c index 776e3935a758..8669b6ecddee 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -406,7 +406,7 @@ static void affs_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) { - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); affs_truncate(inode); } } diff --git a/fs/bfs/file.c b/fs/bfs/file.c index ad3ea1497cc3..ae2892218335 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -166,7 +166,7 @@ static void bfs_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); } static int bfs_write_begin(struct file *file, struct address_space *mapping, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ef3bea7bb257..3f0ddfce96e6 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -221,12 +221,10 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, struct btrfs_path *path, struct inode *inode) { - loff_t oldsize; int ret = 0; - oldsize = i_size_read(inode); btrfs_i_size_write(inode, 0); - truncate_pagecache(inode, oldsize, 0); + truncate_pagecache(inode, 0); /* * We don't need an orphan item because truncating the free space cache diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index db1e43948579..f338c5672d58 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4349,7 +4349,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); if (newsize > oldsize) { - truncate_pagecache(inode, oldsize, newsize); + truncate_pagecache(inode, newsize); ret = btrfs_cont_expand(inode, oldsize, newsize); if (ret) return ret; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index e3bb6477c83f..f9ff9c173f78 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1856,14 +1856,11 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from) static void cifs_setsize(struct inode *inode, loff_t offset) { - loff_t oldsize; - spin_lock(&inode->i_lock); - oldsize = inode->i_size; i_size_write(inode, offset); spin_unlock(&inode->i_lock); - truncate_pagecache(inode, oldsize, offset); + truncate_pagecache(inode, offset); } static int diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 2ec8eb1ab269..a52a5d23c30b 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -861,7 +861,7 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc) static void _write_failed(struct inode *inode, loff_t to) { if (to > inode->i_size) - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); } int exofs_write_begin(struct file *file, struct address_space *mapping, diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 0a87bb10998d..c260de6d7b6d 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -58,7 +58,7 @@ static void ext2_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) { - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); ext2_truncate_blocks(inode, inode->i_size); } } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c79fd7dabe79..0d424d7ac02b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4587,7 +4587,6 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { handle_t *handle; - loff_t oldsize = inode->i_size; if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -4650,7 +4649,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) * Truncate pagecache after we've waited for commit * in data=journal mode to make pages freeable. */ - truncate_pagecache(inode, oldsize, inode->i_size); + truncate_pagecache(inode, inode->i_size); } /* * We want to call ext4_truncate() even if attr->ia_size == diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 11b51bb55b42..0062da21dd8b 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -147,7 +147,7 @@ static void fat_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) { - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); fat_truncate_blocks(inode, inode->i_size); } } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 3ac91086f41f..62b43b577bfc 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1678,7 +1678,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock. */ if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { - truncate_pagecache(inode, oldsize, outarg.attr.size); + truncate_pagecache(inode, outarg.attr.size); invalidate_inode_pages2(inode->i_mapping); } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 84434594e80e..a8ce6dab60a0 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -218,7 +218,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, bool inval = false; if (oldsize != attr->size) { - truncate_pagecache(inode, oldsize, attr->size); + truncate_pagecache(inode, attr->size); inval = true; } else if (fc->auto_inval_data) { struct timespec new_mtime = { diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 5e2f56fccf6b..62a65fc448dc 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1016,7 +1016,7 @@ static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize chunk = oldsize - newsize; if (chunk > max_chunk) chunk = max_chunk; - truncate_pagecache(inode, oldsize, oldsize - chunk); + truncate_pagecache(inode, oldsize - chunk); oldsize -= chunk; gfs2_trans_end(sdp); error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES); @@ -1067,7 +1067,7 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) if (journaled) error = gfs2_journaled_truncate(inode, oldsize, newsize); else - truncate_pagecache(inode, oldsize, newsize); + truncate_pagecache(inode, newsize); if (error) { brelse(dibh); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index f9299d8a64e3..380ab31b5e0f 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -41,7 +41,7 @@ static void hfs_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) { - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); hfs_file_truncate(inode); } } diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 4d2edaea891c..37213d075f3c 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -36,7 +36,7 @@ static void hfsplus_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) { - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); hfsplus_file_truncate(inode); } } diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 4e9dabcf1f4c..67c1a61e0955 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -138,7 +138,7 @@ static void hpfs_write_failed(struct address_space *mapping, loff_t to) hpfs_lock(inode->i_sb); if (to > inode->i_size) { - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); hpfs_truncate(inode); } diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 730f24e282a6..f4aab719add5 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -306,7 +306,7 @@ static void jfs_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) { - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); jfs_truncate(inode); } } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index df122496f328..0332109162a5 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -400,7 +400,7 @@ static void minix_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) { - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); minix_truncate(inode); } } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 87e797640828..eda8879171c4 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -541,7 +541,6 @@ EXPORT_SYMBOL_GPL(nfs_setattr); */ static int nfs_vmtruncate(struct inode * inode, loff_t offset) { - loff_t oldsize; int err; err = inode_newsize_ok(inode, offset); @@ -549,11 +548,10 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset) goto out; spin_lock(&inode->i_lock); - oldsize = inode->i_size; i_size_write(inode, offset); spin_unlock(&inode->i_lock); - truncate_pagecache(inode, oldsize, offset); + truncate_pagecache(inode, offset); out: return err; } diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index b1a5277cfd18..7e350c562e0e 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -254,7 +254,7 @@ void nilfs_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) { - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); nilfs_truncate(inode); } } diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index c5670b8d198c..ea4ba9daeb47 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1768,7 +1768,7 @@ static void ntfs_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) { - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); ntfs_truncate_vfs(inode); } } diff --git a/fs/omfs/file.c b/fs/omfs/file.c index e0d9b3e722bd..54d57d6ba68d 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -311,7 +311,7 @@ static void omfs_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) { - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); omfs_truncate(inode); } } diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 5aa847a603c0..59d85d608898 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -132,13 +132,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(i.freeswap), K(global_page_state(NR_FILE_DIRTY)), K(global_page_state(NR_WRITEBACK)), -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - K(global_page_state(NR_ANON_PAGES) - + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * - HPAGE_PMD_NR), -#else K(global_page_state(NR_ANON_PAGES)), -#endif K(global_page_state(NR_FILE_MAPPED)), K(global_page_state(NR_SHMEM)), K(global_page_state(NR_SLAB_RECLAIMABLE) + diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index c1a591a4725b..66bc316927e8 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -469,7 +469,7 @@ static void sysv_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) { - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); sysv_truncate(inode); } } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index b6d15d349810..062b7925bca0 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -172,7 +172,7 @@ static void udf_write_failed(struct address_space *mapping, loff_t to) loff_t isize = inode->i_size; if (to > isize) { - truncate_pagecache(inode, to, isize); + truncate_pagecache(inode, isize); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { down_write(&iinfo->i_data_sem); udf_clear_extent_cache(inode); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index ff24e4449ece..c8ca96086784 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -531,7 +531,7 @@ static void ufs_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); } static int ufs_write_begin(struct file *file, struct address_space *mapping, diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 977da0ec6604..e51e581454e9 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1582,7 +1582,7 @@ xfs_vm_write_begin( unlock_page(page); if (pos + len > i_size_read(inode)) - truncate_pagecache(inode, pos + len, i_size_read(inode)); + truncate_pagecache(inode, i_size_read(inode)); page_cache_release(page); page = NULL; @@ -1618,7 +1618,7 @@ xfs_vm_write_end( loff_t to = pos + len; if (to > isize) { - truncate_pagecache(inode, to, isize); + truncate_pagecache(inode, isize); xfs_vm_kill_delalloc_range(inode, isize, to); } } diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index b60de92e2edc..3935428c57cf 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -96,9 +96,6 @@ extern int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, unsigned long addr, unsigned long end); -extern int handle_pte_fault(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - pte_t *pte, pmd_t *pmd, unsigned int flags); extern int split_huge_page_to_list(struct page *page, struct list_head *list); static inline int split_huge_page(struct page *page) { diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6c416092e324..60e95872da29 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -30,9 +30,21 @@ struct page; struct mm_struct; struct kmem_cache; -/* Stats that can be updated by kernel. */ -enum mem_cgroup_page_stat_item { - MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */ +/* + * The corresponding mem_cgroup_stat_names is defined in mm/memcontrol.c, + * These two lists should keep in accord with each other. + */ +enum mem_cgroup_stat_index { + /* + * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. + */ + MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ + MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ + MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */ + MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ + MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */ + MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ + MEM_CGROUP_STAT_NSTATS, }; struct mem_cgroup_reclaim_cookie { @@ -41,6 +53,23 @@ struct mem_cgroup_reclaim_cookie { unsigned int generation; }; +enum mem_cgroup_filter_t { + VISIT, /* visit current node */ + SKIP, /* skip the current node and continue traversal */ + SKIP_TREE, /* skip the whole subtree and continue traversal */ +}; + +/* + * mem_cgroup_filter_t predicate might instruct mem_cgroup_iter_cond how to + * iterate through the hierarchy tree. Each tree element is checked by the + * predicate before it is returned by the iterator. If a filter returns + * SKIP or SKIP_TREE then the iterator code continues traversal (with the + * next node down the hierarchy or the next node that doesn't belong under the + * memcg's subtree). + */ +typedef enum mem_cgroup_filter_t +(*mem_cgroup_iter_filter)(struct mem_cgroup *memcg, struct mem_cgroup *root); + #ifdef CONFIG_MEMCG /* * All "charge" functions with gfp_mask should use GFP_KERNEL or @@ -108,9 +137,18 @@ mem_cgroup_prepare_migration(struct page *page, struct page *newpage, extern void mem_cgroup_end_migration(struct mem_cgroup *memcg, struct page *oldpage, struct page *newpage, bool migration_ok); -struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, - struct mem_cgroup *, - struct mem_cgroup_reclaim_cookie *); +struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, + struct mem_cgroup *prev, + struct mem_cgroup_reclaim_cookie *reclaim, + mem_cgroup_iter_filter cond); + +static inline struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, + struct mem_cgroup *prev, + struct mem_cgroup_reclaim_cookie *reclaim) +{ + return mem_cgroup_iter_cond(root, prev, reclaim, NULL); +} + void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); /* @@ -125,6 +163,48 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, extern void mem_cgroup_replace_page_cache(struct page *oldpage, struct page *newpage); +/** + * mem_cgroup_toggle_oom - toggle the memcg OOM killer for the current task + * @new: true to enable, false to disable + * + * Toggle whether a failed memcg charge should invoke the OOM killer + * or just return -ENOMEM. Returns the previous toggle state. + * + * NOTE: Any path that enables the OOM killer before charging must + * call mem_cgroup_oom_synchronize() afterward to finalize the + * OOM handling and clean up. + */ +static inline bool mem_cgroup_toggle_oom(bool new) +{ + bool old; + + old = current->memcg_oom.may_oom; + current->memcg_oom.may_oom = new; + + return old; +} + +static inline void mem_cgroup_enable_oom(void) +{ + bool old = mem_cgroup_toggle_oom(true); + + WARN_ON(old == true); +} + +static inline void mem_cgroup_disable_oom(void) +{ + bool old = mem_cgroup_toggle_oom(false); + + WARN_ON(old == false); +} + +static inline bool task_in_memcg_oom(struct task_struct *p) +{ + return p->memcg_oom.in_memcg_oom; +} + +bool mem_cgroup_oom_synchronize(void); + #ifdef CONFIG_MEMCG_SWAP extern int do_swap_account; #endif @@ -165,24 +245,24 @@ static inline void mem_cgroup_end_update_page_stat(struct page *page, } void mem_cgroup_update_page_stat(struct page *page, - enum mem_cgroup_page_stat_item idx, + enum mem_cgroup_stat_index idx, int val); static inline void mem_cgroup_inc_page_stat(struct page *page, - enum mem_cgroup_page_stat_item idx) + enum mem_cgroup_stat_index idx) { mem_cgroup_update_page_stat(page, idx, 1); } static inline void mem_cgroup_dec_page_stat(struct page *page, - enum mem_cgroup_page_stat_item idx) + enum mem_cgroup_stat_index idx) { mem_cgroup_update_page_stat(page, idx, -1); } -unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, - gfp_t gfp_mask, - unsigned long *total_scanned); +enum mem_cgroup_filter_t +mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, + struct mem_cgroup *root); void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); static inline void mem_cgroup_count_vm_event(struct mm_struct *mm, @@ -296,6 +376,15 @@ static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg, struct page *oldpage, struct page *newpage, bool migration_ok) { } +static inline struct mem_cgroup * +mem_cgroup_iter_cond(struct mem_cgroup *root, + struct mem_cgroup *prev, + struct mem_cgroup_reclaim_cookie *reclaim, + mem_cgroup_iter_filter cond) +{ + /* first call must return non-NULL, second return NULL */ + return (struct mem_cgroup *)(unsigned long)!prev; +} static inline struct mem_cgroup * mem_cgroup_iter(struct mem_cgroup *root, @@ -348,22 +437,45 @@ static inline void mem_cgroup_end_update_page_stat(struct page *page, { } +static inline bool mem_cgroup_toggle_oom(bool new) +{ + return false; +} + +static inline void mem_cgroup_enable_oom(void) +{ +} + +static inline void mem_cgroup_disable_oom(void) +{ +} + +static inline bool task_in_memcg_oom(struct task_struct *p) +{ + return false; +} + +static inline bool mem_cgroup_oom_synchronize(void) +{ + return false; +} + static inline void mem_cgroup_inc_page_stat(struct page *page, - enum mem_cgroup_page_stat_item idx) + enum mem_cgroup_stat_index idx) { } static inline void mem_cgroup_dec_page_stat(struct page *page, - enum mem_cgroup_page_stat_item idx) + enum mem_cgroup_stat_index idx) { } static inline -unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, - gfp_t gfp_mask, - unsigned long *total_scanned) +enum mem_cgroup_filter_t +mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, + struct mem_cgroup *root) { - return 0; + return VISIT; } static inline void mem_cgroup_split_huge_fixup(struct page *head) diff --git a/include/linux/mm.h b/include/linux/mm.h index caf543c7eaa7..8b6e55ee8855 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -176,6 +176,7 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */ #define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */ #define FAULT_FLAG_TRIED 0x40 /* second try */ +#define FAULT_FLAG_USER 0x80 /* The fault originated in userspace */ /* * vm_fault is filled by the the pagefault handler and passed to the vma's @@ -876,11 +877,12 @@ static inline int page_mapped(struct page *page) #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ #define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ +#define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */ #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \ - VM_FAULT_HWPOISON_LARGE) + VM_FAULT_FALLBACK | VM_FAULT_HWPOISON_LARGE) /* Encode hstate index for a hwpoisoned large page */ #define VM_FAULT_SET_HINDEX(x) ((x) << 12) @@ -984,7 +986,7 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping, unmap_mapping_range(mapping, holebegin, holelen, 0); } -extern void truncate_pagecache(struct inode *inode, loff_t old, loff_t new); +extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); int truncate_inode_page(struct address_space *mapping, struct page *page); diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h index 96a509b6be04..201a69749659 100644 --- a/include/linux/res_counter.h +++ b/include/linux/res_counter.h @@ -54,7 +54,7 @@ struct res_counter { struct res_counter *parent; }; -#define RESOURCE_MAX (unsigned long long)LLONG_MAX +#define RES_COUNTER_MAX ULLONG_MAX /** * Helpers to interact with userspace diff --git a/include/linux/sched.h b/include/linux/sched.h index 45f254dddafc..6682da36b293 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1393,6 +1393,13 @@ struct task_struct { unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ } memcg_batch; unsigned int memcg_kmem_skip_account; + struct memcg_oom_info { + unsigned int may_oom:1; + unsigned int in_memcg_oom:1; + unsigned int oom_locked:1; + int wakeups; + struct mem_cgroup *wait_on_memcg; + } memcg_oom; #endif #ifdef CONFIG_UPROBES struct uprobe_task *utask; diff --git a/include/linux/swap.h b/include/linux/swap.h index c03c139219c9..46ba0c6c219f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -280,7 +280,7 @@ extern void activate_page(struct page *); extern void mark_page_accessed(struct page *); extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); -extern int lru_add_drain_all(void); +extern void lru_add_drain_all(void); extern void rotate_reclaimable_page(struct page *page); extern void deactivate_page(struct page *page); extern void swap_setup(void); diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 9bd0934f6c33..7a7d2ee96d42 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -74,7 +74,7 @@ static int __init gcov_persist_setup(char *str) { unsigned long val; - if (strict_strtoul(str, 0, &val)) { + if (kstrtoul(str, 0, &val)) { pr_warning("invalid gcov_persist parameter '%s'\n", str); return 0; } diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 6ada93c23a9a..9659d38e008f 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -113,7 +113,7 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj, unsigned long cnt; int ret; - if (strict_strtoul(buf, 0, &cnt)) + if (kstrtoul(buf, 0, &cnt)) return -EINVAL; ret = crash_shrink_memory(cnt); diff --git a/kernel/params.c b/kernel/params.c index 501bde4f3bee..81c4e78c8f4c 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -253,13 +253,13 @@ int parse_args(const char *doing, EXPORT_SYMBOL(param_ops_##name) -STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, strict_strtoul); -STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol); -STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, strict_strtoul); -STANDARD_PARAM_DEF(int, int, "%i", long, strict_strtol); -STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul); -STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol); -STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); +STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, kstrtoul); +STANDARD_PARAM_DEF(short, short, "%hi", long, kstrtoul); +STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, kstrtoul); +STANDARD_PARAM_DEF(int, int, "%i", long, kstrtoul); +STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, kstrtoul); +STANDARD_PARAM_DEF(long, long, "%li", long, kstrtoul); +STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, kstrtoul); int param_set_charp(const char *val, const struct kernel_param *kp) { diff --git a/kernel/res_counter.c b/kernel/res_counter.c index ff55247e7049..4aa8a305aede 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -17,8 +17,8 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent) { spin_lock_init(&counter->lock); - counter->limit = RESOURCE_MAX; - counter->soft_limit = RESOURCE_MAX; + counter->limit = RES_COUNTER_MAX; + counter->soft_limit = RES_COUNTER_MAX; counter->parent = parent; } @@ -178,23 +178,30 @@ u64 res_counter_read_u64(struct res_counter *counter, int member) #endif int res_counter_memparse_write_strategy(const char *buf, - unsigned long long *res) + unsigned long long *resp) { char *end; + unsigned long long res; - /* return RESOURCE_MAX(unlimited) if "-1" is specified */ + /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ if (*buf == '-') { - *res = simple_strtoull(buf + 1, &end, 10); - if (*res != 1 || *end != '\0') + res = simple_strtoull(buf + 1, &end, 10); + if (res != 1 || *end != '\0') return -EINVAL; - *res = RESOURCE_MAX; + *resp = RES_COUNTER_MAX; return 0; } - *res = memparse(buf, &end); + res = memparse(buf, &end); if (*end != '\0') return -EINVAL; - *res = PAGE_ALIGN(*res); + if (PAGE_ALIGN(res) >= res) + res = PAGE_ALIGN(res); + else + res = RES_COUNTER_MAX; + + *resp = res; + return 0; } diff --git a/mm/Kconfig b/mm/Kconfig index 6cdd27043303..026771a9b097 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -245,7 +245,7 @@ config COMPACTION config MIGRATION bool "Page migration" def_bool y - depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA + depends on (NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA) && MMU help Allows the migration of the physical location of pages of processes while the virtual addresses are not changed. This is useful in @@ -480,7 +480,7 @@ config FRONTSWAP config CMA bool "Contiguous Memory Allocator" - depends on HAVE_MEMBLOCK + depends on HAVE_MEMBLOCK && MMU select MIGRATION select MEMORY_ISOLATION help diff --git a/mm/filemap.c b/mm/filemap.c index e607728db4a8..1e6aec4a2d2e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -467,32 +467,34 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, error = mem_cgroup_cache_charge(page, current->mm, gfp_mask & GFP_RECLAIM_MASK); if (error) - goto out; + return error; error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); - if (error == 0) { - page_cache_get(page); - page->mapping = mapping; - page->index = offset; - - spin_lock_irq(&mapping->tree_lock); - error = radix_tree_insert(&mapping->page_tree, offset, page); - if (likely(!error)) { - mapping->nrpages++; - __inc_zone_page_state(page, NR_FILE_PAGES); - spin_unlock_irq(&mapping->tree_lock); - trace_mm_filemap_add_to_page_cache(page); - } else { - page->mapping = NULL; - /* Leave page->index set: truncation relies upon it */ - spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_uncharge_cache_page(page); - page_cache_release(page); - } - radix_tree_preload_end(); - } else + if (error) { mem_cgroup_uncharge_cache_page(page); -out: + return error; + } + + page_cache_get(page); + page->mapping = mapping; + page->index = offset; + + spin_lock_irq(&mapping->tree_lock); + error = radix_tree_insert(&mapping->page_tree, offset, page); + radix_tree_preload_end(); + if (unlikely(error)) + goto err_insert; + mapping->nrpages++; + __inc_zone_page_state(page, NR_FILE_PAGES); + spin_unlock_irq(&mapping->tree_lock); + trace_mm_filemap_add_to_page_cache(page); + return 0; +err_insert: + page->mapping = NULL; + /* Leave page->index set: truncation relies upon it */ + spin_unlock_irq(&mapping->tree_lock); + mem_cgroup_uncharge_cache_page(page); + page_cache_release(page); return error; } EXPORT_SYMBOL(add_to_page_cache_locked); @@ -1614,6 +1616,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct inode *inode = mapping->host; pgoff_t offset = vmf->pgoff; struct page *page; + bool memcg_oom; pgoff_t size; int ret = 0; @@ -1622,7 +1625,11 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return VM_FAULT_SIGBUS; /* - * Do we have something in the page cache already? + * Do we have something in the page cache already? Either + * way, try readahead, but disable the memcg OOM killer for it + * as readahead is optional and no errors are propagated up + * the fault stack. The OOM killer is enabled while trying to + * instantiate the faulting page individually below. */ page = find_get_page(mapping, offset); if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { @@ -1630,10 +1637,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * We found the page, so try async readahead before * waiting for the lock. */ + memcg_oom = mem_cgroup_toggle_oom(false); do_async_mmap_readahead(vma, ra, file, page, offset); + mem_cgroup_toggle_oom(memcg_oom); } else if (!page) { /* No page in the page cache at all */ + memcg_oom = mem_cgroup_toggle_oom(false); do_sync_mmap_readahead(vma, ra, file, offset); + mem_cgroup_toggle_oom(memcg_oom); count_vm_event(PGMAJFAULT); mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d66010e0049d..7489884682d8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -695,11 +695,10 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) return pmd; } -static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma) +static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) { pmd_t entry; - entry = mk_pmd(page, vma->vm_page_prot); - entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + entry = mk_pmd(page, prot); entry = pmd_mkhuge(entry); return entry; } @@ -732,7 +731,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, pte_free(mm, pgtable); } else { pmd_t entry; - entry = mk_huge_pmd(page, vma); + entry = mk_huge_pmd(page, vma->vm_page_prot); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); page_add_new_anon_rmap(page, vma, haddr); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); @@ -788,77 +788,57 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, { struct page *page; unsigned long haddr = address & HPAGE_PMD_MASK; - pte_t *pte; - if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) { - if (unlikely(anon_vma_prepare(vma))) - return VM_FAULT_OOM; - if (unlikely(khugepaged_enter(vma))) + if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) + return VM_FAULT_FALLBACK; + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + if (unlikely(khugepaged_enter(vma))) + return VM_FAULT_OOM; + if (!(flags & FAULT_FLAG_WRITE) && + transparent_hugepage_use_zero_page()) { + pgtable_t pgtable; + struct page *zero_page; + bool set; + pgtable = pte_alloc_one(mm, haddr); + if (unlikely(!pgtable)) return VM_FAULT_OOM; - if (!(flags & FAULT_FLAG_WRITE) && - transparent_hugepage_use_zero_page()) { - pgtable_t pgtable; - struct page *zero_page; - bool set; - pgtable = pte_alloc_one(mm, haddr); - if (unlikely(!pgtable)) - return VM_FAULT_OOM; - zero_page = get_huge_zero_page(); - if (unlikely(!zero_page)) { - pte_free(mm, pgtable); - count_vm_event(THP_FAULT_FALLBACK); - goto out; - } - spin_lock(&mm->page_table_lock); - set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, - zero_page); - spin_unlock(&mm->page_table_lock); - if (!set) { - pte_free(mm, pgtable); - put_huge_zero_page(); - } - return 0; - } - page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr, numa_node_id(), 0); - if (unlikely(!page)) { + zero_page = get_huge_zero_page(); + if (unlikely(!zero_page)) { + pte_free(mm, pgtable); count_vm_event(THP_FAULT_FALLBACK); - goto out; - } - count_vm_event(THP_FAULT_ALLOC); - if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { - put_page(page); - goto out; + return VM_FAULT_FALLBACK; } - if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, - page))) { - mem_cgroup_uncharge_page(page); - put_page(page); - goto out; + spin_lock(&mm->page_table_lock); + set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, + zero_page); + spin_unlock(&mm->page_table_lock); + if (!set) { + pte_free(mm, pgtable); + put_huge_zero_page(); } - return 0; } -out: - /* - * Use __pte_alloc instead of pte_alloc_map, because we can't - * run pte_offset_map on the pmd, if an huge pmd could - * materialize from under us from a different thread. - */ - if (unlikely(pmd_none(*pmd)) && - unlikely(__pte_alloc(mm, vma, pmd, address))) - return VM_FAULT_OOM; - /* if an huge pmd materialized from under us just retry later */ - if (unlikely(pmd_trans_huge(*pmd))) - return 0; - /* - * A regular pmd is established and it can't morph into a huge pmd - * from under us anymore at this point because we hold the mmap_sem - * read mode and khugepaged takes it in write mode. So now it's - * safe to run pte_offset_map(). - */ - pte = pte_offset_map(pmd, address); - return handle_pte_fault(mm, vma, address, pte, pmd, flags); + page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), + vma, haddr, numa_node_id(), 0); + if (unlikely(!page)) { + count_vm_event(THP_FAULT_FALLBACK); + return VM_FAULT_FALLBACK; + } + if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { + put_page(page); + count_vm_event(THP_FAULT_FALLBACK); + return VM_FAULT_FALLBACK; + } + if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) { + mem_cgroup_uncharge_page(page); + put_page(page); + count_vm_event(THP_FAULT_FALLBACK); + return VM_FAULT_FALLBACK; + } + + count_vm_event(THP_FAULT_ALLOC); + return 0; } int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, @@ -1170,7 +1150,6 @@ alloc: new_page = NULL; if (unlikely(!new_page)) { - count_vm_event(THP_FAULT_FALLBACK); if (is_huge_zero_pmd(orig_pmd)) { ret = do_huge_pmd_wp_zero_page_fallback(mm, vma, address, pmd, orig_pmd, haddr); @@ -1181,9 +1160,9 @@ alloc: split_huge_page(page); put_page(page); } + count_vm_event(THP_FAULT_FALLBACK); goto out; } - count_vm_event(THP_FAULT_ALLOC); if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { put_page(new_page); @@ -1191,10 +1170,13 @@ alloc: split_huge_page(page); put_page(page); } + count_vm_event(THP_FAULT_FALLBACK); ret |= VM_FAULT_OOM; goto out; } + count_vm_event(THP_FAULT_ALLOC); + if (is_huge_zero_pmd(orig_pmd)) clear_huge_page(new_page, haddr, HPAGE_PMD_NR); else @@ -1215,7 +1197,8 @@ alloc: goto out_mn; } else { pmd_t entry; - entry = mk_huge_pmd(new_page, vma); + entry = mk_huge_pmd(new_page, vma->vm_page_prot); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); pmdp_clear_flush(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); @@ -1666,7 +1649,6 @@ static void __split_huge_page_refcount(struct page *page, BUG_ON(atomic_read(&page->_count) <= 0); __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1); - __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); ClearPageCompound(page); compound_unlock(page); @@ -2364,7 +2346,8 @@ static void collapse_huge_page(struct mm_struct *mm, __SetPageUptodate(new_page); pgtable = pmd_pgtable(_pmd); - _pmd = mk_huge_pmd(new_page, vma); + _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); + _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); /* * spin_lock() below is not the equivalent of smp_wmb(), so diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c6bd28edd533..d5ff3ce13029 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -39,7 +39,6 @@ #include <linux/limits.h> #include <linux/export.h> #include <linux/mutex.h> -#include <linux/rbtree.h> #include <linux/slab.h> #include <linux/swap.h> #include <linux/swapops.h> @@ -85,26 +84,12 @@ static int really_do_swap_account __initdata = 0; #endif -/* - * Statistics for memory cgroup. - */ -enum mem_cgroup_stat_index { - /* - * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. - */ - MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ - MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ - MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */ - MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ - MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ - MEM_CGROUP_STAT_NSTATS, -}; - static const char * const mem_cgroup_stat_names[] = { "cache", "rss", "rss_huge", "mapped_file", + "writeback", "swap", }; @@ -175,10 +160,6 @@ struct mem_cgroup_per_zone { struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; - struct rb_node tree_node; /* RB tree node */ - unsigned long long usage_in_excess;/* Set to the value by which */ - /* the soft limit is exceeded*/ - bool on_tree; struct mem_cgroup *memcg; /* Back pointer, we cannot */ /* use container_of */ }; @@ -187,26 +168,6 @@ struct mem_cgroup_per_node { struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; }; -/* - * Cgroups above their limits are maintained in a RB-Tree, independent of - * their hierarchy representation - */ - -struct mem_cgroup_tree_per_zone { - struct rb_root rb_root; - spinlock_t lock; -}; - -struct mem_cgroup_tree_per_node { - struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; -}; - -struct mem_cgroup_tree { - struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; -}; - -static struct mem_cgroup_tree soft_limit_tree __read_mostly; - struct mem_cgroup_threshold { struct eventfd_ctx *eventfd; u64 threshold; @@ -280,6 +241,7 @@ struct mem_cgroup { bool oom_lock; atomic_t under_oom; + atomic_t oom_wakeups; int swappiness; /* OOM-Killer disable */ @@ -304,7 +266,7 @@ struct mem_cgroup { * Should we move charges of a task when a task is moved into this * mem_cgroup ? And what type of charges should we move ? */ - unsigned long move_charge_at_immigrate; + unsigned long move_charge_at_immigrate; /* * set > 0 if pages under this cgroup are moving to other cgroup. */ @@ -341,6 +303,22 @@ struct mem_cgroup { atomic_t numainfo_events; atomic_t numainfo_updating; #endif + /* + * Protects soft_contributed transitions. + * See mem_cgroup_update_soft_limit + */ + spinlock_t soft_lock; + + /* + * If true then this group has increased parents' children_in_excess + * when it got over the soft limit. + * When a group falls bellow the soft limit, parents' children_in_excess + * is decreased and soft_contributed changed to false. + */ + bool soft_contributed; + + /* Number of children that are in soft limit excess */ + atomic_t children_in_excess; struct mem_cgroup_per_node *nodeinfo[0]; /* WARNING: nodeinfo must be the last member here */ @@ -444,7 +422,6 @@ static bool move_file(void) * limit reclaim to prevent infinite loops, if they ever occur. */ #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 -#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 enum charge_type { MEM_CGROUP_CHARGE_TYPE_CACHE = 0, @@ -671,164 +648,6 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) return mem_cgroup_zoneinfo(memcg, nid, zid); } -static struct mem_cgroup_tree_per_zone * -soft_limit_tree_node_zone(int nid, int zid) -{ - return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; -} - -static struct mem_cgroup_tree_per_zone * -soft_limit_tree_from_page(struct page *page) -{ - int nid = page_to_nid(page); - int zid = page_zonenum(page); - - return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; -} - -static void -__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz, - unsigned long long new_usage_in_excess) -{ - struct rb_node **p = &mctz->rb_root.rb_node; - struct rb_node *parent = NULL; - struct mem_cgroup_per_zone *mz_node; - - if (mz->on_tree) - return; - - mz->usage_in_excess = new_usage_in_excess; - if (!mz->usage_in_excess) - return; - while (*p) { - parent = *p; - mz_node = rb_entry(parent, struct mem_cgroup_per_zone, - tree_node); - if (mz->usage_in_excess < mz_node->usage_in_excess) - p = &(*p)->rb_left; - /* - * We can't avoid mem cgroups that are over their soft - * limit by the same amount - */ - else if (mz->usage_in_excess >= mz_node->usage_in_excess) - p = &(*p)->rb_right; - } - rb_link_node(&mz->tree_node, parent, p); - rb_insert_color(&mz->tree_node, &mctz->rb_root); - mz->on_tree = true; -} - -static void -__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz) -{ - if (!mz->on_tree) - return; - rb_erase(&mz->tree_node, &mctz->rb_root); - mz->on_tree = false; -} - -static void -mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, - struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz) -{ - spin_lock(&mctz->lock); - __mem_cgroup_remove_exceeded(memcg, mz, mctz); - spin_unlock(&mctz->lock); -} - - -static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) -{ - unsigned long long excess; - struct mem_cgroup_per_zone *mz; - struct mem_cgroup_tree_per_zone *mctz; - int nid = page_to_nid(page); - int zid = page_zonenum(page); - mctz = soft_limit_tree_from_page(page); - - /* - * Necessary to update all ancestors when hierarchy is used. - * because their event counter is not touched. - */ - for (; memcg; memcg = parent_mem_cgroup(memcg)) { - mz = mem_cgroup_zoneinfo(memcg, nid, zid); - excess = res_counter_soft_limit_excess(&memcg->res); - /* - * We have to update the tree if mz is on RB-tree or - * mem is over its softlimit. - */ - if (excess || mz->on_tree) { - spin_lock(&mctz->lock); - /* if on-tree, remove it */ - if (mz->on_tree) - __mem_cgroup_remove_exceeded(memcg, mz, mctz); - /* - * Insert again. mz->usage_in_excess will be updated. - * If excess is 0, no tree ops. - */ - __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); - spin_unlock(&mctz->lock); - } - } -} - -static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) -{ - int node, zone; - struct mem_cgroup_per_zone *mz; - struct mem_cgroup_tree_per_zone *mctz; - - for_each_node(node) { - for (zone = 0; zone < MAX_NR_ZONES; zone++) { - mz = mem_cgroup_zoneinfo(memcg, node, zone); - mctz = soft_limit_tree_node_zone(node, zone); - mem_cgroup_remove_exceeded(memcg, mz, mctz); - } - } -} - -static struct mem_cgroup_per_zone * -__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) -{ - struct rb_node *rightmost = NULL; - struct mem_cgroup_per_zone *mz; - -retry: - mz = NULL; - rightmost = rb_last(&mctz->rb_root); - if (!rightmost) - goto done; /* Nothing to reclaim from */ - - mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); - /* - * Remove the node now but someone else can add it back, - * we will to add it back at the end of reclaim to its correct - * position in the tree. - */ - __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); - if (!res_counter_soft_limit_excess(&mz->memcg->res) || - !css_tryget(&mz->memcg->css)) - goto retry; -done: - return mz; -} - -static struct mem_cgroup_per_zone * -mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) -{ - struct mem_cgroup_per_zone *mz; - - spin_lock(&mctz->lock); - mz = __mem_cgroup_largest_soft_limit_node(mctz); - spin_unlock(&mctz->lock); - return mz; -} - /* * Implementation Note: reading percpu statistics for memcg. * @@ -1003,6 +822,48 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, } /* + * Called from rate-limited memcg_check_events when enough + * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure + * that all the parents up the hierarchy will be notified that this group + * is in excess or that it is not in excess anymore. mmecg->soft_contributed + * makes the transition a single action whenever the state flips from one to + * the other. + */ +static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg) +{ + unsigned long long excess = res_counter_soft_limit_excess(&memcg->res); + struct mem_cgroup *parent = memcg; + int delta = 0; + + spin_lock(&memcg->soft_lock); + if (excess) { + if (!memcg->soft_contributed) { + delta = 1; + memcg->soft_contributed = true; + } + } else { + if (memcg->soft_contributed) { + delta = -1; + memcg->soft_contributed = false; + } + } + + /* + * Necessary to update all ancestors when hierarchy is used + * because their event counter is not touched. + * We track children even outside the hierarchy for the root + * cgroup because tree walk starting at root should visit + * all cgroups and we want to prevent from pointless tree + * walk if no children is below the limit. + */ + while (delta && (parent = parent_mem_cgroup(parent))) + atomic_add(delta, &parent->children_in_excess); + if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy) + atomic_add(delta, &root_mem_cgroup->children_in_excess); + spin_unlock(&memcg->soft_lock); +} + +/* * Check events in order. * */ @@ -1025,7 +886,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) mem_cgroup_threshold(memcg); if (unlikely(do_softlimit)) - mem_cgroup_update_tree(memcg, page); + mem_cgroup_update_soft_limit(memcg); #if MAX_NUMNODES > 1 if (unlikely(do_numainfo)) atomic_inc(&memcg->numainfo_events); @@ -1068,6 +929,15 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) return memcg; } +static enum mem_cgroup_filter_t +mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root, + mem_cgroup_iter_filter cond) +{ + if (!cond) + return VISIT; + return cond(memcg, root); +} + /* * Returns a next (in a pre-order walk) alive memcg (with elevated css * ref. count) or NULL if the whole root's subtree has been visited. @@ -1075,7 +945,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) * helper function to be used by mem_cgroup_iter */ static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, - struct mem_cgroup *last_visited) + struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond) { struct cgroup_subsys_state *prev_css, *next_css; @@ -1093,11 +963,31 @@ skip_node: if (next_css) { struct mem_cgroup *mem = mem_cgroup_from_css(next_css); - if (css_tryget(&mem->css)) - return mem; - else { + switch (mem_cgroup_filter(mem, root, cond)) { + case SKIP: prev_css = next_css; goto skip_node; + case SKIP_TREE: + if (mem == root) + return NULL; + /* + * css_rightmost_descendant is not an optimal way to + * skip through a subtree (especially for imbalanced + * trees leaning to right) but that's what we have right + * now. More effective solution would be traversing + * right-up for first non-NULL without calling + * css_next_descendant_pre afterwards. + */ + prev_css = css_rightmost_descendant(next_css); + goto skip_node; + case VISIT: + if (css_tryget(&mem->css)) + return mem; + else { + prev_css = next_css; + goto skip_node; + } + break; } } @@ -1161,6 +1051,7 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, * @root: hierarchy root * @prev: previously returned memcg, NULL on first invocation * @reclaim: cookie for shared reclaim walks, NULL for full walks + * @cond: filter for visited nodes, NULL for no filter * * Returns references to children of the hierarchy below @root, or * @root itself, or %NULL after a full round-trip. @@ -1173,15 +1064,18 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, * divide up the memcgs in the hierarchy among all concurrent * reclaimers operating on the same zone and priority. */ -struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, +struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, struct mem_cgroup *prev, - struct mem_cgroup_reclaim_cookie *reclaim) + struct mem_cgroup_reclaim_cookie *reclaim, + mem_cgroup_iter_filter cond) { struct mem_cgroup *memcg = NULL; struct mem_cgroup *last_visited = NULL; - if (mem_cgroup_disabled()) - return NULL; + if (mem_cgroup_disabled()) { + /* first call must return non-NULL, second return NULL */ + return (struct mem_cgroup *)(unsigned long)!prev; + } if (!root) root = root_mem_cgroup; @@ -1192,7 +1086,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (!root->use_hierarchy && root != root_mem_cgroup) { if (prev) goto out_css_put; - return root; + if (mem_cgroup_filter(root, root, cond) == VISIT) + return root; + return NULL; } rcu_read_lock(); @@ -1215,7 +1111,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, last_visited = mem_cgroup_iter_load(iter, root, &seq); } - memcg = __mem_cgroup_iter_next(root, last_visited); + memcg = __mem_cgroup_iter_next(root, last_visited, cond); if (reclaim) { mem_cgroup_iter_update(iter, last_visited, memcg, seq); @@ -1226,7 +1122,11 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, reclaim->generation = iter->generation; } - if (prev && !memcg) + /* + * We have finished the whole tree walk or no group has been + * visited because filter told us to skip the root node. + */ + if (!memcg && (prev || (cond && !last_visited))) goto out_unlock; } out_unlock: @@ -1867,6 +1767,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, return total; } +#if MAX_NUMNODES > 1 /** * test_mem_cgroup_node_reclaimable * @memcg: the target memcg @@ -1889,7 +1790,6 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, return false; } -#if MAX_NUMNODES > 1 /* * Always updating the nodemask is not very good - even if we have an empty @@ -1957,115 +1857,64 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) return node; } -/* - * Check all nodes whether it contains reclaimable pages or not. - * For quick scan, we make use of scan_nodes. This will allow us to skip - * unused nodes. But scan_nodes is lazily updated and may not cotain - * enough new information. We need to do double check. - */ -static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) -{ - int nid; - - /* - * quick check...making use of scan_node. - * We can skip unused nodes. - */ - if (!nodes_empty(memcg->scan_nodes)) { - for (nid = first_node(memcg->scan_nodes); - nid < MAX_NUMNODES; - nid = next_node(nid, memcg->scan_nodes)) { - - if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) - return true; - } - } - /* - * Check rest of nodes. - */ - for_each_node_state(nid, N_MEMORY) { - if (node_isset(nid, memcg->scan_nodes)) - continue; - if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) - return true; - } - return false; -} - #else int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) { return 0; } -static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) -{ - return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); -} #endif -static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, - struct zone *zone, - gfp_t gfp_mask, - unsigned long *total_scanned) -{ - struct mem_cgroup *victim = NULL; - int total = 0; - int loop = 0; - unsigned long excess; - unsigned long nr_scanned; - struct mem_cgroup_reclaim_cookie reclaim = { - .zone = zone, - .priority = 0, - }; - - excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; - - while (1) { - victim = mem_cgroup_iter(root_memcg, victim, &reclaim); - if (!victim) { - loop++; - if (loop >= 2) { - /* - * If we have not been able to reclaim - * anything, it might because there are - * no reclaimable pages under this hierarchy - */ - if (!total) - break; - /* - * We want to do more targeted reclaim. - * excess >> 2 is not to excessive so as to - * reclaim too much, nor too less that we keep - * coming back to reclaim from this cgroup - */ - if (total >= (excess >> 2) || - (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) - break; - } - continue; - } - if (!mem_cgroup_reclaimable(victim, false)) - continue; - total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, - zone, &nr_scanned); - *total_scanned += nr_scanned; - if (!res_counter_soft_limit_excess(&root_memcg->res)) +/* + * A group is eligible for the soft limit reclaim under the given root + * hierarchy if + * a) it is over its soft limit + * b) any parent up the hierarchy is over its soft limit + * + * If the given group doesn't have any children over the limit then it + * doesn't make any sense to iterate its subtree. + */ +enum mem_cgroup_filter_t +mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, + struct mem_cgroup *root) +{ + struct mem_cgroup *parent; + + if (!memcg) + memcg = root_mem_cgroup; + parent = memcg; + + if (res_counter_soft_limit_excess(&memcg->res)) + return VISIT; + + /* + * If any parent up to the root in the hierarchy is over its soft limit + * then we have to obey and reclaim from this group as well. + */ + while ((parent = parent_mem_cgroup(parent))) { + if (res_counter_soft_limit_excess(&parent->res)) + return VISIT; + if (parent == root) break; } - mem_cgroup_iter_break(root_memcg, victim); - return total; + + if (!atomic_read(&memcg->children_in_excess)) + return SKIP_TREE; + return SKIP; } +static DEFINE_SPINLOCK(memcg_oom_lock); + /* * Check OOM-Killer is already running under our hierarchy. * If someone is running, return false. - * Has to be called with memcg_oom_lock */ -static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) +static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) { struct mem_cgroup *iter, *failed = NULL; + spin_lock(&memcg_oom_lock); + for_each_mem_cgroup_tree(iter, memcg) { if (iter->oom_lock) { /* @@ -2079,33 +1928,33 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) iter->oom_lock = true; } - if (!failed) - return true; - - /* - * OK, we failed to lock the whole subtree so we have to clean up - * what we set up to the failing subtree - */ - for_each_mem_cgroup_tree(iter, memcg) { - if (iter == failed) { - mem_cgroup_iter_break(memcg, iter); - break; + if (failed) { + /* + * OK, we failed to lock the whole subtree so we have + * to clean up what we set up to the failing subtree + */ + for_each_mem_cgroup_tree(iter, memcg) { + if (iter == failed) { + mem_cgroup_iter_break(memcg, iter); + break; + } + iter->oom_lock = false; } - iter->oom_lock = false; } - return false; + + spin_unlock(&memcg_oom_lock); + + return !failed; } -/* - * Has to be called with memcg_oom_lock - */ -static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg) +static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) { struct mem_cgroup *iter; + spin_lock(&memcg_oom_lock); for_each_mem_cgroup_tree(iter, memcg) iter->oom_lock = false; - return 0; + spin_unlock(&memcg_oom_lock); } static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) @@ -2129,7 +1978,6 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) atomic_add_unless(&iter->under_oom, -1, 0); } -static DEFINE_SPINLOCK(memcg_oom_lock); static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); struct oom_wait_info { @@ -2159,6 +2007,7 @@ static int memcg_oom_wake_function(wait_queue_t *wait, static void memcg_wakeup_oom(struct mem_cgroup *memcg) { + atomic_inc(&memcg->oom_wakeups); /* for filtering, pass "memcg" as argument. */ __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); } @@ -2170,56 +2019,136 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) } /* - * try to call OOM killer. returns false if we should exit memory-reclaim loop. + * try to call OOM killer */ -static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, - int order) +static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { - struct oom_wait_info owait; - bool locked, need_to_kill; + bool locked; + int wakeups; - owait.memcg = memcg; - owait.wait.flags = 0; - owait.wait.func = memcg_oom_wake_function; - owait.wait.private = current; - INIT_LIST_HEAD(&owait.wait.task_list); - need_to_kill = true; - mem_cgroup_mark_under_oom(memcg); + if (!current->memcg_oom.may_oom) + return; + + current->memcg_oom.in_memcg_oom = 1; - /* At first, try to OOM lock hierarchy under memcg.*/ - spin_lock(&memcg_oom_lock); - locked = mem_cgroup_oom_lock(memcg); /* - * Even if signal_pending(), we can't quit charge() loop without - * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL - * under OOM is always welcomed, use TASK_KILLABLE here. + * As with any blocking lock, a contender needs to start + * listening for wakeups before attempting the trylock, + * otherwise it can miss the wakeup from the unlock and sleep + * indefinitely. This is just open-coded because our locking + * is so particular to memcg hierarchies. */ - prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); - if (!locked || memcg->oom_kill_disable) - need_to_kill = false; + wakeups = atomic_read(&memcg->oom_wakeups); + mem_cgroup_mark_under_oom(memcg); + + locked = mem_cgroup_oom_trylock(memcg); + if (locked) mem_cgroup_oom_notify(memcg); - spin_unlock(&memcg_oom_lock); - if (need_to_kill) { - finish_wait(&memcg_oom_waitq, &owait.wait); + if (locked && !memcg->oom_kill_disable) { + mem_cgroup_unmark_under_oom(memcg); mem_cgroup_out_of_memory(memcg, mask, order); + mem_cgroup_oom_unlock(memcg); + /* + * There is no guarantee that an OOM-lock contender + * sees the wakeups triggered by the OOM kill + * uncharges. Wake any sleepers explicitely. + */ + memcg_oom_recover(memcg); } else { - schedule(); - finish_wait(&memcg_oom_waitq, &owait.wait); + /* + * A system call can just return -ENOMEM, but if this + * is a page fault and somebody else is handling the + * OOM already, we need to sleep on the OOM waitqueue + * for this memcg until the situation is resolved. + * Which can take some time because it might be + * handled by a userspace task. + * + * However, this is the charge context, which means + * that we may sit on a large call stack and hold + * various filesystem locks, the mmap_sem etc. and we + * don't want the OOM handler to deadlock on them + * while we sit here and wait. Store the current OOM + * context in the task_struct, then return -ENOMEM. + * At the end of the page fault handler, with the + * stack unwound, pagefault_out_of_memory() will check + * back with us by calling + * mem_cgroup_oom_synchronize(), possibly putting the + * task to sleep. + */ + current->memcg_oom.oom_locked = locked; + current->memcg_oom.wakeups = wakeups; + css_get(&memcg->css); + current->memcg_oom.wait_on_memcg = memcg; } - spin_lock(&memcg_oom_lock); - if (locked) - mem_cgroup_oom_unlock(memcg); - memcg_wakeup_oom(memcg); - spin_unlock(&memcg_oom_lock); +} - mem_cgroup_unmark_under_oom(memcg); +/** + * mem_cgroup_oom_synchronize - complete memcg OOM handling + * + * This has to be called at the end of a page fault if the the memcg + * OOM handler was enabled and the fault is returning %VM_FAULT_OOM. + * + * Memcg supports userspace OOM handling, so failed allocations must + * sleep on a waitqueue until the userspace task resolves the + * situation. Sleeping directly in the charge context with all kinds + * of locks held is not a good idea, instead we remember an OOM state + * in the task and mem_cgroup_oom_synchronize() has to be called at + * the end of the page fault to put the task to sleep and clean up the + * OOM state. + * + * Returns %true if an ongoing memcg OOM situation was detected and + * finalized, %false otherwise. + */ +bool mem_cgroup_oom_synchronize(void) +{ + struct oom_wait_info owait; + struct mem_cgroup *memcg; - if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) + /* OOM is global, do not handle */ + if (!current->memcg_oom.in_memcg_oom) return false; - /* Give chance to dying process */ - schedule_timeout_uninterruptible(1); + + /* + * We invoked the OOM killer but there is a chance that a kill + * did not free up any charges. Everybody else might already + * be sleeping, so restart the fault and keep the rampage + * going until some charges are released. + */ + memcg = current->memcg_oom.wait_on_memcg; + if (!memcg) + goto out; + + if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) + goto out_memcg; + + owait.memcg = memcg; + owait.wait.flags = 0; + owait.wait.func = memcg_oom_wake_function; + owait.wait.private = current; + INIT_LIST_HEAD(&owait.wait.task_list); + + prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); + /* Only sleep if we didn't miss any wakeups since OOM */ + if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups) + schedule(); + finish_wait(&memcg_oom_waitq, &owait.wait); +out_memcg: + mem_cgroup_unmark_under_oom(memcg); + if (current->memcg_oom.oom_locked) { + mem_cgroup_oom_unlock(memcg); + /* + * There is no guarantee that an OOM-lock contender + * sees the wakeups triggered by the OOM kill + * uncharges. Wake any sleepers explicitely. + */ + memcg_oom_recover(memcg); + } + css_put(&memcg->css); + current->memcg_oom.wait_on_memcg = NULL; +out: + current->memcg_oom.in_memcg_oom = 0; return true; } @@ -2288,7 +2217,7 @@ void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) } void mem_cgroup_update_page_stat(struct page *page, - enum mem_cgroup_page_stat_item idx, int val) + enum mem_cgroup_stat_index idx, int val) { struct mem_cgroup *memcg; struct page_cgroup *pc = lookup_page_cgroup(page); @@ -2297,18 +2226,11 @@ void mem_cgroup_update_page_stat(struct page *page, if (mem_cgroup_disabled()) return; + VM_BUG_ON(!rcu_read_lock_held()); memcg = pc->mem_cgroup; if (unlikely(!memcg || !PageCgroupUsed(pc))) return; - switch (idx) { - case MEMCG_NR_FILE_MAPPED: - idx = MEM_CGROUP_STAT_FILE_MAPPED; - break; - default: - BUG(); - } - this_cpu_add(memcg->stat->count[idx], val); } @@ -2450,7 +2372,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) flush_work(&stock->work); } out: - put_online_cpus(); + put_online_cpus(); } /* @@ -2532,12 +2454,11 @@ enum { CHARGE_RETRY, /* need to retry but retry is not bad */ CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */ - CHARGE_OOM_DIE, /* the current is killed because of OOM */ }; static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned int nr_pages, unsigned int min_pages, - bool oom_check) + bool invoke_oom) { unsigned long csize = nr_pages * PAGE_SIZE; struct mem_cgroup *mem_over_limit; @@ -2594,14 +2515,10 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, if (mem_cgroup_wait_acct_move(mem_over_limit)) return CHARGE_RETRY; - /* If we don't need to call oom-killer at el, return immediately */ - if (!oom_check) - return CHARGE_NOMEM; - /* check OOM */ - if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize))) - return CHARGE_OOM_DIE; + if (invoke_oom) + mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize)); - return CHARGE_RETRY; + return CHARGE_NOMEM; } /* @@ -2704,7 +2621,7 @@ again: } do { - bool oom_check; + bool invoke_oom = oom && !nr_oom_retries; /* If killed, bypass charge */ if (fatal_signal_pending(current)) { @@ -2712,14 +2629,8 @@ again: goto bypass; } - oom_check = false; - if (oom && !nr_oom_retries) { - oom_check = true; - nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; - } - - ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages, - oom_check); + ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, + nr_pages, invoke_oom); switch (ret) { case CHARGE_OK: break; @@ -2732,16 +2643,12 @@ again: css_put(&memcg->css); goto nomem; case CHARGE_NOMEM: /* OOM routine works */ - if (!oom) { + if (!oom || invoke_oom) { css_put(&memcg->css); goto nomem; } - /* If oom, we never return -ENOMEM */ nr_oom_retries--; break; - case CHARGE_OOM_DIE: /* Killed by OOM Killer */ - css_put(&memcg->css); - goto bypass; } } while (ret != CHARGE_OK); @@ -2882,7 +2789,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, * is accessed after testing USED bit. To make pc->mem_cgroup visible * before USED bit, we need memory barrier here. * See mem_cgroup_add_lru_list(), etc. - */ + */ smp_wmb(); SetPageCgroupUsed(pc); @@ -2905,9 +2812,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, unlock_page_cgroup(pc); /* - * "charge_statistics" updated event counter. Then, check it. - * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. - * if they exceeds softlimit. + * "charge_statistics" updated event counter. */ memcg_check_events(memcg, page); } @@ -3626,9 +3531,9 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) * the page allocator. Therefore, the following sequence when backed by * the SLUB allocator: * - * memcg_stop_kmem_account(); - * kmalloc(<large_number>) - * memcg_resume_kmem_account(); + * memcg_stop_kmem_account(); + * kmalloc(<large_number>) + * memcg_resume_kmem_account(); * * would effectively ignore the fact that we should skip accounting, * since it will drive us directly to this function without passing @@ -3750,6 +3655,20 @@ void mem_cgroup_split_huge_fixup(struct page *head) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline +void mem_cgroup_move_account_page_stat(struct mem_cgroup *from, + struct mem_cgroup *to, + unsigned int nr_pages, + enum mem_cgroup_stat_index idx) +{ + /* Update stat data for mem_cgroup */ + preempt_disable(); + WARN_ON_ONCE(from->stat->count[idx] < nr_pages); + __this_cpu_add(from->stat->count[idx], -nr_pages); + __this_cpu_add(to->stat->count[idx], nr_pages); + preempt_enable(); +} + /** * mem_cgroup_move_account - move account of the page * @page: the page @@ -3795,13 +3714,14 @@ static int mem_cgroup_move_account(struct page *page, move_lock_mem_cgroup(from, &flags); - if (!anon && page_mapped(page)) { - /* Update mapped_file data for mem_cgroup */ - preempt_disable(); - __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); - __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); - preempt_enable(); - } + if (!anon && page_mapped(page)) + mem_cgroup_move_account_page_stat(from, to, nr_pages, + MEM_CGROUP_STAT_FILE_MAPPED); + + if (PageWriteback(page)) + mem_cgroup_move_account_page_stat(from, to, nr_pages, + MEM_CGROUP_STAT_WRITEBACK); + mem_cgroup_charge_statistics(from, page, anon, -nr_pages); /* caller should have done css_get */ @@ -4657,7 +4577,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, MEM_CGROUP_RECLAIM_SHRINK); curusage = res_counter_read_u64(&memcg->res, RES_USAGE); /* Usage is reduced ? */ - if (curusage >= oldusage) + if (curusage >= oldusage) retry_count--; else oldusage = curusage; @@ -4678,7 +4598,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, int enlarge = 0; /* see mem_cgroup_resize_res_limit */ - retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; + retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); while (retry_count) { if (signal_pending(current)) { @@ -4727,98 +4647,6 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, return ret; } -unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, - gfp_t gfp_mask, - unsigned long *total_scanned) -{ - unsigned long nr_reclaimed = 0; - struct mem_cgroup_per_zone *mz, *next_mz = NULL; - unsigned long reclaimed; - int loop = 0; - struct mem_cgroup_tree_per_zone *mctz; - unsigned long long excess; - unsigned long nr_scanned; - - if (order > 0) - return 0; - - mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); - /* - * This loop can run a while, specially if mem_cgroup's continuously - * keep exceeding their soft limit and putting the system under - * pressure - */ - do { - if (next_mz) - mz = next_mz; - else - mz = mem_cgroup_largest_soft_limit_node(mctz); - if (!mz) - break; - - nr_scanned = 0; - reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, - gfp_mask, &nr_scanned); - nr_reclaimed += reclaimed; - *total_scanned += nr_scanned; - spin_lock(&mctz->lock); - - /* - * If we failed to reclaim anything from this memory cgroup - * it is time to move on to the next cgroup - */ - next_mz = NULL; - if (!reclaimed) { - do { - /* - * Loop until we find yet another one. - * - * By the time we get the soft_limit lock - * again, someone might have aded the - * group back on the RB tree. Iterate to - * make sure we get a different mem. - * mem_cgroup_largest_soft_limit_node returns - * NULL if no other cgroup is present on - * the tree - */ - next_mz = - __mem_cgroup_largest_soft_limit_node(mctz); - if (next_mz == mz) - css_put(&next_mz->memcg->css); - else /* next_mz == NULL or other memcg */ - break; - } while (1); - } - __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); - excess = res_counter_soft_limit_excess(&mz->memcg->res); - /* - * One school of thought says that we should not add - * back the node to the tree if reclaim returns 0. - * But our reclaim could return 0, simply because due - * to priority we are exposing a smaller subset of - * memory to reclaim from. Consider this as a longer - * term TODO. - */ - /* If excess == 0, no tree ops */ - __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); - spin_unlock(&mctz->lock); - css_put(&mz->memcg->css); - loop++; - /* - * Could not reclaim anything and there are no more - * mem cgroups to try or we seem to be looping without - * reclaiming anything. - */ - if (!nr_reclaimed && - (next_mz == NULL || - loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) - break; - } while (!nr_reclaimed); - if (next_mz) - css_put(&next_mz->memcg->css); - return nr_reclaimed; -} - /** * mem_cgroup_force_empty_list - clears LRU of a group * @memcg: group to clear @@ -4990,18 +4818,12 @@ static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css, unsigned int event) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - int ret; if (mem_cgroup_is_root(memcg)) return -EINVAL; - css_get(&memcg->css); - ret = mem_cgroup_force_empty(memcg); - css_put(&memcg->css); - - return ret; + return mem_cgroup_force_empty(memcg); } - static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -5139,7 +4961,7 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) */ mutex_lock(&memcg_create_mutex); mutex_lock(&set_limit_mutex); - if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { + if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) { if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) { ret = -EBUSY; goto out; @@ -5149,7 +4971,7 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) ret = memcg_update_cache_sizes(memcg); if (ret) { - res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); + res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX); goto out; } static_key_slow_inc(&memcg_kmem_enabled_key); @@ -6089,8 +5911,6 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = &pn->zoneinfo[zone]; lruvec_init(&mz->lruvec); - mz->usage_in_excess = 0; - mz->on_tree = false; mz->memcg = memcg; } memcg->nodeinfo[node] = pn; @@ -6146,7 +5966,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) int node; size_t size = memcg_size(); - mem_cgroup_remove_from_trees(memcg); free_css_id(&mem_cgroup_subsys, &memcg->css); for_each_node(node) @@ -6183,29 +6002,6 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) } EXPORT_SYMBOL(parent_mem_cgroup); -static void __init mem_cgroup_soft_limit_tree_init(void) -{ - struct mem_cgroup_tree_per_node *rtpn; - struct mem_cgroup_tree_per_zone *rtpz; - int tmp, node, zone; - - for_each_node(node) { - tmp = node; - if (!node_state(node, N_NORMAL_MEMORY)) - tmp = -1; - rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); - BUG_ON(!rtpn); - - soft_limit_tree.rb_tree_per_node[node] = rtpn; - - for (zone = 0; zone < MAX_NR_ZONES; zone++) { - rtpz = &rtpn->rb_tree_per_zone[zone]; - rtpz->rb_root = RB_ROOT; - spin_lock_init(&rtpz->lock); - } - } -} - static struct cgroup_subsys_state * __ref mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -6235,6 +6031,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) mutex_init(&memcg->thresholds_lock); spin_lock_init(&memcg->move_lock); vmpressure_init(&memcg->vmpressure); + spin_lock_init(&memcg->soft_lock); return &memcg->css; @@ -6312,6 +6109,13 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) mem_cgroup_invalidate_reclaim_iterators(memcg); mem_cgroup_reparent_charges(memcg); + if (memcg->soft_contributed) { + while ((memcg = parent_mem_cgroup(memcg))) + atomic_dec(&memcg->children_in_excess); + + if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy) + atomic_dec(&root_mem_cgroup->children_in_excess); + } mem_cgroup_destroy_all_caches(memcg); vmpressure_cleanup(&memcg->vmpressure); } @@ -6986,7 +6790,6 @@ static int __init mem_cgroup_init(void) { hotcpu_notifier(memcg_cpu_hotplug_callback, 0); enable_swap_cgroup(); - mem_cgroup_soft_limit_tree_init(); memcg_stock_init(); return 0; } diff --git a/mm/memory.c b/mm/memory.c index 2b73dbde2274..ca0003947115 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3695,7 +3695,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -int handle_pte_fault(struct mm_struct *mm, +static int handle_pte_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *pte, pmd_t *pmd, unsigned int flags) { @@ -3754,22 +3754,14 @@ unlock: /* * By the time we get here, we already hold the mm semaphore */ -int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, unsigned int flags) +static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, unsigned int flags) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; - __set_current_state(TASK_RUNNING); - - count_vm_event(PGFAULT); - mem_cgroup_count_vm_event(mm, PGFAULT); - - /* do counter updates before entering really critical section. */ - check_sync_rss_stat(current); - if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, flags); @@ -3782,9 +3774,12 @@ retry: if (!pmd) return VM_FAULT_OOM; if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { + int ret = VM_FAULT_FALLBACK; if (!vma->vm_ops) - return do_huge_pmd_anonymous_page(mm, vma, address, - pmd, flags); + ret = do_huge_pmd_anonymous_page(mm, vma, address, + pmd, flags); + if (!(ret & VM_FAULT_FALLBACK)) + return ret; } else { pmd_t orig_pmd = *pmd; int ret; @@ -3850,6 +3845,37 @@ retry: return handle_pte_fault(mm, vma, address, pte, pmd, flags); } +int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, unsigned int flags) +{ + int ret; + + __set_current_state(TASK_RUNNING); + + count_vm_event(PGFAULT); + mem_cgroup_count_vm_event(mm, PGFAULT); + + /* do counter updates before entering really critical section. */ + check_sync_rss_stat(current); + + /* + * Enable the memcg OOM handling for faults triggered in user + * space. Kernel faults are handled more gracefully. + */ + if (flags & FAULT_FLAG_USER) + mem_cgroup_enable_oom(); + + ret = __handle_mm_fault(mm, vma, address, flags); + + if (flags & FAULT_FLAG_USER) + mem_cgroup_disable_oom(); + + if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))) + mem_cgroup_oom_synchronize(); + + return ret; +} + #ifndef __PAGETABLE_PUD_FOLDED /* * Allocate page upper directory. diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 98e75f2ac7bc..314e9d274381 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -678,9 +678,12 @@ out: */ void pagefault_out_of_memory(void) { - struct zonelist *zonelist = node_zonelist(first_online_node, - GFP_KERNEL); + struct zonelist *zonelist; + if (mem_cgroup_oom_synchronize()) + return; + + zonelist = node_zonelist(first_online_node, GFP_KERNEL); if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) { out_of_memory(NULL, 0, 0, NULL, false); clear_zonelist_oom(zonelist, GFP_KERNEL); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6c7b0187be8e..f5236f804aa6 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2143,11 +2143,17 @@ EXPORT_SYMBOL(account_page_dirtied); /* * Helper function for set_page_writeback family. + * + * The caller must hold mem_cgroup_begin/end_update_page_stat() lock + * while calling this function. + * See test_set_page_writeback for example. + * * NOTE: Unlike account_page_dirtied this does not rely on being atomic * wrt interrupts. */ void account_page_writeback(struct page *page) { + mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); inc_zone_page_state(page, NR_WRITEBACK); } EXPORT_SYMBOL(account_page_writeback); @@ -2364,7 +2370,10 @@ int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); int ret; + bool locked; + unsigned long memcg_flags; + mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags); if (mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; @@ -2385,9 +2394,11 @@ int test_clear_page_writeback(struct page *page) ret = TestClearPageWriteback(page); } if (ret) { + mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); dec_zone_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_WRITTEN); } + mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); return ret; } @@ -2395,7 +2406,10 @@ int test_set_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); int ret; + bool locked; + unsigned long memcg_flags; + mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags); if (mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; @@ -2422,6 +2436,7 @@ int test_set_page_writeback(struct page *page) } if (!ret) account_page_writeback(page); + mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); return ret; } diff --git a/mm/rmap.c b/mm/rmap.c index 07748e68b729..fd3ee7a54a13 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1052,11 +1052,11 @@ void do_page_add_anon_rmap(struct page *page, { int first = atomic_inc_and_test(&page->_mapcount); if (first) { - if (!PageTransHuge(page)) - __inc_zone_page_state(page, NR_ANON_PAGES); - else + if (PageTransHuge(page)) __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, + hpage_nr_pages(page)); } if (unlikely(PageKsm(page))) return; @@ -1085,10 +1085,10 @@ void page_add_new_anon_rmap(struct page *page, VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); SetPageSwapBacked(page); atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ - if (!PageTransHuge(page)) - __inc_zone_page_state(page, NR_ANON_PAGES); - else + if (PageTransHuge(page)) __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, + hpage_nr_pages(page)); __page_set_anon_rmap(page, vma, address, 1); if (!mlocked_vma_newpage(vma, page)) { SetPageActive(page); @@ -1111,7 +1111,7 @@ void page_add_file_rmap(struct page *page) mem_cgroup_begin_update_page_stat(page, &locked, &flags); if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); + mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); } mem_cgroup_end_update_page_stat(page, &locked, &flags); } @@ -1148,14 +1148,14 @@ void page_remove_rmap(struct page *page) goto out; if (anon) { mem_cgroup_uncharge_page(page); - if (!PageTransHuge(page)) - __dec_zone_page_state(page, NR_ANON_PAGES); - else + if (PageTransHuge(page)) __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, + -hpage_nr_pages(page)); } else { __dec_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); + mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); mem_cgroup_end_update_page_stat(page, &locked, &flags); } if (unlikely(PageMlocked(page))) diff --git a/mm/swap.c b/mm/swap.c index c899502d3e36..759c3caf44bd 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -432,6 +432,11 @@ static void activate_page_drain(int cpu) pagevec_lru_move_fn(pvec, __activate_page, NULL); } +static bool need_activate_page_drain(int cpu) +{ + return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; +} + void activate_page(struct page *page) { if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { @@ -449,6 +454,11 @@ static inline void activate_page_drain(int cpu) { } +static bool need_activate_page_drain(int cpu) +{ + return false; +} + void activate_page(struct page *page) { struct zone *zone = page_zone(page); @@ -701,12 +711,36 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) lru_add_drain(); } -/* - * Returns 0 for success - */ -int lru_add_drain_all(void) +static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); + +void lru_add_drain_all(void) { - return schedule_on_each_cpu(lru_add_drain_per_cpu); + static DEFINE_MUTEX(lock); + static struct cpumask has_work; + int cpu; + + mutex_lock(&lock); + get_online_cpus(); + cpumask_clear(&has_work); + + for_each_online_cpu(cpu) { + struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); + + if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || + pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || + need_activate_page_drain(cpu)) { + INIT_WORK(work, lru_add_drain_per_cpu); + schedule_work_on(cpu, work); + cpumask_set_cpu(cpu, &has_work); + } + } + + for_each_cpu(cpu, &has_work) + flush_work(&per_cpu(lru_add_drain_work, cpu)); + + put_online_cpus(); + mutex_unlock(&lock); } /* diff --git a/mm/truncate.c b/mm/truncate.c index e2e8a8a7eb9d..353b683afd6e 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -567,7 +567,6 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); /** * truncate_pagecache - unmap and remove pagecache that has been truncated * @inode: inode - * @oldsize: old file size * @newsize: new file size * * inode's new i_size must already be written before truncate_pagecache @@ -580,7 +579,7 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); * situations such as writepage being called for a page that has already * had its underlying blocks deallocated. */ -void truncate_pagecache(struct inode *inode, loff_t oldsize, loff_t newsize) +void truncate_pagecache(struct inode *inode, loff_t newsize) { struct address_space *mapping = inode->i_mapping; loff_t holebegin = round_up(newsize, PAGE_SIZE); @@ -614,12 +613,8 @@ EXPORT_SYMBOL(truncate_pagecache); */ void truncate_setsize(struct inode *inode, loff_t newsize) { - loff_t oldsize; - - oldsize = inode->i_size; i_size_write(inode, newsize); - - truncate_pagecache(inode, oldsize, newsize); + truncate_pagecache(inode, newsize); } EXPORT_SYMBOL(truncate_setsize); diff --git a/mm/vmscan.c b/mm/vmscan.c index beb35778c69f..8ed1b775bdc9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -139,11 +139,23 @@ static bool global_reclaim(struct scan_control *sc) { return !sc->target_mem_cgroup; } + +static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) +{ + struct mem_cgroup *root = sc->target_mem_cgroup; + return !mem_cgroup_disabled() && + mem_cgroup_soft_reclaim_eligible(root, root) != SKIP_TREE; +} #else static bool global_reclaim(struct scan_control *sc) { return true; } + +static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) +{ + return false; +} #endif unsigned long zone_reclaimable_pages(struct zone *zone) @@ -2164,9 +2176,11 @@ static inline bool should_continue_reclaim(struct zone *zone, } } -static void shrink_zone(struct zone *zone, struct scan_control *sc) +static int +__shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) { unsigned long nr_reclaimed, nr_scanned; + int groups_scanned = 0; do { struct mem_cgroup *root = sc->target_mem_cgroup; @@ -2174,15 +2188,17 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) .zone = zone, .priority = sc->priority, }; - struct mem_cgroup *memcg; + struct mem_cgroup *memcg = NULL; + mem_cgroup_iter_filter filter = (soft_reclaim) ? + mem_cgroup_soft_reclaim_eligible : NULL; nr_reclaimed = sc->nr_reclaimed; nr_scanned = sc->nr_scanned; - memcg = mem_cgroup_iter(root, NULL, &reclaim); - do { + while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) { struct lruvec *lruvec; + groups_scanned++; lruvec = mem_cgroup_zone_lruvec(zone, memcg); shrink_lruvec(lruvec, sc); @@ -2202,8 +2218,7 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) mem_cgroup_iter_break(root, memcg); break; } - memcg = mem_cgroup_iter(root, memcg, &reclaim); - } while (memcg); + } vmpressure(sc->gfp_mask, sc->target_mem_cgroup, sc->nr_scanned - nr_scanned, @@ -2211,6 +2226,37 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); + + return groups_scanned; +} + + +static void shrink_zone(struct zone *zone, struct scan_control *sc) +{ + bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc); + unsigned long nr_scanned = sc->nr_scanned; + int scanned_groups; + + scanned_groups = __shrink_zone(zone, sc, do_soft_reclaim); + /* + * memcg iterator might race with other reclaimer or start from + * a incomplete tree walk so the tree walk in __shrink_zone + * might have missed groups that are above the soft limit. Try + * another loop to catch up with others. Do it just once to + * prevent from reclaim latencies when other reclaimers always + * preempt this one. + */ + if (do_soft_reclaim && !scanned_groups) + __shrink_zone(zone, sc, do_soft_reclaim); + + /* + * No group is over the soft limit or those that are do not have + * pages in the zone we are reclaiming so we have to reclaim everybody + */ + if (do_soft_reclaim && (sc->nr_scanned == nr_scanned)) { + __shrink_zone(zone, sc, false); + return; + } } /* Returns true if compaction should go ahead for a high-order request */ @@ -2274,8 +2320,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; - unsigned long nr_soft_reclaimed; - unsigned long nr_soft_scanned; bool aborted_reclaim = false; /* @@ -2315,18 +2359,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) continue; } } - /* - * This steals pages from memory cgroups over softlimit - * and returns the number of reclaimed pages and - * scanned pages. This works for global memory pressure - * and balancing, not for a memcg's limit. - */ - nr_soft_scanned = 0; - nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, - sc->order, sc->gfp_mask, - &nr_soft_scanned); - sc->nr_reclaimed += nr_soft_reclaimed; - sc->nr_scanned += nr_soft_scanned; /* need some check for avoid more shrink_zone() */ } @@ -2920,8 +2952,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, { int i; int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ - unsigned long nr_soft_reclaimed; - unsigned long nr_soft_scanned; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .priority = DEF_PRIORITY, @@ -3036,15 +3066,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, sc.nr_scanned = 0; - nr_soft_scanned = 0; - /* - * Call soft limit reclaim before calling shrink_zone. - */ - nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, - order, sc.gfp_mask, - &nr_soft_scanned); - sc.nr_reclaimed += nr_soft_reclaimed; - /* * There should be no need to raise the scanning * priority if enough pages are already being scanned diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 8a57d79b0b16..559d4ae6ebf4 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -87,8 +87,8 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) if (!cg_proto) return -EINVAL; - if (val > RESOURCE_MAX) - val = RESOURCE_MAX; + if (val > RES_COUNTER_MAX) + val = RES_COUNTER_MAX; tcp = tcp_from_cgproto(cg_proto); @@ -101,9 +101,9 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT, net->ipv4.sysctl_tcp_mem[i]); - if (val == RESOURCE_MAX) + if (val == RES_COUNTER_MAX) clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); - else if (val != RESOURCE_MAX) { + else if (val != RES_COUNTER_MAX) { /* * The active bit needs to be written after the static_key * update. This is what guarantees that the socket activation @@ -187,7 +187,7 @@ static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) switch (cft->private) { case RES_LIMIT: - val = tcp_read_stat(memcg, RES_LIMIT, RESOURCE_MAX); + val = tcp_read_stat(memcg, RES_LIMIT, RES_COUNTER_MAX); break; case RES_USAGE: val = tcp_read_usage(memcg); |