diff options
author | Kirill A. Shutemov <kirill.shutemov@linux.intel.com> | 2014-04-03 14:48:13 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-04-03 16:21:03 -0700 |
commit | f0c6d4d295e4ea9a47375304420baa38ca279542 (patch) | |
tree | dde84d183f17f94772500f408177486d509d85c7 /mm | |
parent | ec47c3b9543054f6f255d027100fa8214e637003 (diff) | |
download | lwn-f0c6d4d295e4ea9a47375304420baa38ca279542.tar.gz lwn-f0c6d4d295e4ea9a47375304420baa38ca279542.zip |
mm: introduce do_shared_fault() and drop do_fault()
Introduce do_shared_fault(). The function does what do_fault() does for
write faults to shared mappings.
Unlike do_fault(), do_shared_fault() is relatively clean and
straightforward.
Old do_fault() is not needed anymore. Let it die.
[lliubbo@gmail.com: fix NULL pointer dereference]
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Matthew Wilcox <matthew.r.wilcox@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Bob Liu <bob.liu@oracle.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/memory.c | 226 |
1 files changed, 62 insertions, 164 deletions
diff --git a/mm/memory.c b/mm/memory.c index 5be13e794a7c..d4320e42989d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2748,7 +2748,7 @@ reuse: * bit after it clear all dirty ptes, but before a racing * do_wp_page installs a dirty pte. * - * do_fault is protected similarly. + * do_shared_fault is protected similarly. */ if (!page_mkwrite) { wait_on_page_locked(dirty_page); @@ -3410,188 +3410,86 @@ uncharge_out: return ret; } -/* - * do_fault() tries to create a new page mapping. It aggressively - * tries to share with existing pages, but makes a separate copy if - * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid - * the next page fault. - * - * As this is called only for pages that do not currently exist, we - * do not need to flush old virtual caches or the TLB. - * - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), and pte neither mapped nor locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. - */ -static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma, +static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pgoff_t pgoff, unsigned int flags, pte_t orig_pte) { - pte_t *page_table; + struct page *fault_page; + struct address_space *mapping; spinlock_t *ptl; - struct page *page, *fault_page; - struct page *cow_page; - pte_t entry; - int anon = 0; - struct page *dirty_page = NULL; - int ret; - int page_mkwrite = 0; - - /* - * If we do COW later, allocate page befor taking lock_page() - * on the file cache page. This will reduce lock holding time. 
- */ - if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { - - if (unlikely(anon_vma_prepare(vma))) - return VM_FAULT_OOM; - - cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); - if (!cow_page) - return VM_FAULT_OOM; - - if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) { - page_cache_release(cow_page); - return VM_FAULT_OOM; - } - } else - cow_page = NULL; + pte_t entry, *pte; + int dirtied = 0; + struct vm_fault vmf; + int ret, tmp; ret = __do_fault(vma, address, pgoff, flags, &fault_page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) - goto uncharge_out; + return ret; /* - * Should we do an early C-O-W break? + * Check if the backing address space wants to know that the page is + * about to become writable */ - page = fault_page; - if (flags & FAULT_FLAG_WRITE) { - if (!(vma->vm_flags & VM_SHARED)) { - page = cow_page; - anon = 1; - copy_user_highpage(page, fault_page, address, vma); - __SetPageUptodate(page); - } else { - /* - * If the page will be shareable, see if the backing - * address space wants to know that the page is about - * to become writable - */ - if (vma->vm_ops->page_mkwrite) { - struct vm_fault vmf; - int tmp; - - vmf.virtual_address = - (void __user *)(address & PAGE_MASK); - vmf.pgoff = pgoff; - vmf.flags = flags; - vmf.page = fault_page; - - unlock_page(page); - vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; - tmp = vma->vm_ops->page_mkwrite(vma, &vmf); - if (unlikely(tmp & - (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { - ret = tmp; - goto unwritable_page; - } - if (unlikely(!(tmp & VM_FAULT_LOCKED))) { - lock_page(page); - if (!page->mapping) { - ret = 0; /* retry the fault */ - unlock_page(page); - goto unwritable_page; - } - } else - VM_BUG_ON_PAGE(!PageLocked(page), page); - page_mkwrite = 1; - } - } + if (!vma->vm_ops->page_mkwrite) + goto set_pte; - } + unlock_page(fault_page); + vmf.virtual_address = (void __user *)(address & PAGE_MASK); + vmf.pgoff = pgoff; + vmf.flags = 
FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; + vmf.page = fault_page; - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + tmp = vma->vm_ops->page_mkwrite(vma, &vmf); + if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { + page_cache_release(fault_page); + return tmp; + } - /* - * This silly early PAGE_DIRTY setting removes a race - * due to the bad i386 page protection. But it's valid - * for other architectures too. - * - * Note that if FAULT_FLAG_WRITE is set, we either now have - * an exclusive copy of the page, or this is a shared mapping, - * so we can make it writable and dirty to avoid having to - * handle that later. - */ - /* Only go through if we didn't race with anybody else... */ - if (likely(pte_same(*page_table, orig_pte))) { - flush_icache_page(vma, page); - entry = mk_pte(page, vma->vm_page_prot); - if (flags & FAULT_FLAG_WRITE) - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - else if (pte_file(orig_pte) && pte_file_soft_dirty(orig_pte)) - pte_mksoft_dirty(entry); - if (anon) { - inc_mm_counter_fast(mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, address); - } else { - inc_mm_counter_fast(mm, MM_FILEPAGES); - page_add_file_rmap(page); - if (flags & FAULT_FLAG_WRITE) { - dirty_page = page; - get_page(dirty_page); - } + if (unlikely(!(tmp & VM_FAULT_LOCKED))) { + lock_page(fault_page); + if (!fault_page->mapping) { + unlock_page(fault_page); + page_cache_release(fault_page); + return 0; /* retry */ } - set_pte_at(mm, address, page_table, entry); - - /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, address, page_table); - } else { - if (cow_page) - mem_cgroup_uncharge_page(cow_page); - if (anon) - page_cache_release(page); - else - anon = 1; /* no anon but release faulted_page */ + } else + VM_BUG_ON_PAGE(!PageLocked(fault_page), fault_page); +set_pte: + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + if (unlikely(!pte_same(*pte, orig_pte))) { + pte_unmap_unlock(pte, ptl); + 
unlock_page(fault_page); + page_cache_release(fault_page); + return ret; } - pte_unmap_unlock(page_table, ptl); - - if (dirty_page) { - struct address_space *mapping = page->mapping; - int dirtied = 0; + flush_icache_page(vma, fault_page); + entry = mk_pte(fault_page, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + inc_mm_counter_fast(mm, MM_FILEPAGES); + page_add_file_rmap(fault_page); + set_pte_at(mm, address, pte, entry); - if (set_page_dirty(dirty_page)) - dirtied = 1; - unlock_page(dirty_page); - put_page(dirty_page); - if ((dirtied || page_mkwrite) && mapping) { - /* - * Some device drivers do not set page.mapping but still - * dirty their pages - */ - balance_dirty_pages_ratelimited(mapping); - } + /* no need to invalidate: a not-present page won't be cached */ + update_mmu_cache(vma, address, pte); + pte_unmap_unlock(pte, ptl); - /* file_update_time outside page_lock */ - if (vma->vm_file && !page_mkwrite) - file_update_time(vma->vm_file); - } else { - unlock_page(fault_page); - if (anon) - page_cache_release(fault_page); + if (set_page_dirty(fault_page)) + dirtied = 1; + mapping = fault_page->mapping; + unlock_page(fault_page); + if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { + /* + * Some device drivers do not set page.mapping but still + * dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); } - return ret; + /* file_update_time outside page_lock */ + if (vma->vm_file && !vma->vm_ops->page_mkwrite) + file_update_time(vma->vm_file); -unwritable_page: - page_cache_release(page); - return ret; -uncharge_out: - /* fs's fault handler get error */ - if (cow_page) { - mem_cgroup_uncharge_page(cow_page); - page_cache_release(cow_page); - } return ret; } @@ -3609,7 +3507,7 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!(vma->vm_flags & VM_SHARED)) return do_cow_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); - return do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 
+ return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } /* @@ -3647,7 +3545,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!(vma->vm_flags & VM_SHARED)) return do_cow_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); - return do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); + return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, |