summaryrefslogtreecommitdiff
path: root/mm/madvise.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/madvise.c')
-rw-r--r--mm/madvise.c240
1 files changed, 170 insertions, 70 deletions
diff --git a/mm/madvise.c b/mm/madvise.c
index 49f3a75046f6..b17f684322ad 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -387,7 +387,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
folio = pmd_folio(orig_pmd);
/* Do not interfere with other mappings of this folio */
- if (folio_likely_mapped_shared(folio))
+ if (folio_maybe_mapped_shared(folio))
goto huge_unlock;
if (pageout_anon_only_filter && !folio_test_anon(folio))
@@ -486,7 +486,7 @@ restart:
if (nr < folio_nr_pages(folio)) {
int err;
- if (folio_likely_mapped_shared(folio))
+ if (folio_maybe_mapped_shared(folio))
continue;
if (pageout_anon_only_filter && !folio_test_anon(folio))
continue;
@@ -721,7 +721,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
if (nr < folio_nr_pages(folio)) {
int err;
- if (folio_likely_mapped_shared(folio))
+ if (folio_maybe_mapped_shared(folio))
continue;
if (!folio_trylock(folio))
continue;
@@ -933,7 +933,16 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
*/
end = vma->vm_end;
}
- VM_WARN_ON(start >= end);
+ /*
+ * If the memory region between start and end was
+ * originally backed by 4kB pages and then remapped to
+ * be backed by hugepages while mmap_lock was dropped,
+ * the adjustment for hugetlb vma above may have rounded
+ * end down to the start address.
+ */
+ if (start == end)
+ return 0;
+ VM_WARN_ON(start > end);
}
if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
@@ -1042,13 +1051,7 @@ static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked)
if (!allow_locked)
disallowed |= VM_LOCKED;
- if (!vma_is_anonymous(vma))
- return false;
-
- if ((vma->vm_flags & (VM_MAYWRITE | disallowed)) != VM_MAYWRITE)
- return false;
-
- return true;
+ return !(vma->vm_flags & disallowed);
}
static bool is_guard_pte_marker(pte_t ptent)
@@ -1389,7 +1392,32 @@ static int madvise_inject_error(int behavior,
return 0;
}
-#endif
+
+static bool is_memory_failure(int behavior)
+{
+ switch (behavior) {
+ case MADV_HWPOISON:
+ case MADV_SOFT_OFFLINE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+#else
+
+static int madvise_inject_error(int behavior,
+ unsigned long start, unsigned long end)
+{
+ return 0;
+}
+
+static bool is_memory_failure(int behavior)
+{
+ return false;
+}
+
+#endif /* CONFIG_MEMORY_FAILURE */
static bool
madvise_behavior_valid(int behavior)
@@ -1565,6 +1593,111 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
madvise_vma_anon_name);
}
#endif /* CONFIG_ANON_VMA_NAME */
+
+static int madvise_lock(struct mm_struct *mm, int behavior)
+{
+ if (is_memory_failure(behavior))
+ return 0;
+
+ if (madvise_need_mmap_write(behavior)) {
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+ } else {
+ mmap_read_lock(mm);
+ }
+ return 0;
+}
+
+static void madvise_unlock(struct mm_struct *mm, int behavior)
+{
+ if (is_memory_failure(behavior))
+ return;
+
+ if (madvise_need_mmap_write(behavior))
+ mmap_write_unlock(mm);
+ else
+ mmap_read_unlock(mm);
+}
+
+static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior)
+{
+ size_t len;
+
+ if (!madvise_behavior_valid(behavior))
+ return false;
+
+ if (!PAGE_ALIGNED(start))
+ return false;
+ len = PAGE_ALIGN(len_in);
+
+ /* Check to see whether len was rounded up from small -ve to zero */
+ if (len_in && !len)
+ return false;
+
+ if (start + len < start)
+ return false;
+
+ return true;
+}
+
+/*
+ * madvise_should_skip() - Return true if the request is invalid or a no-op.
+ * @start: Start address of madvise-requested address range.
+ * @len_in: Length of madvise-requested address range.
+ * @behavior: Requested madvise behavior.
+ * @err: Pointer to store an error code from the check.
+ *
+ * If the specified behavior is invalid, or the operation would affect no
+ * memory, the request should be skipped.  This function returns true in
+ * those cases, after storing the appropriate result code in @err.
+ */
+static bool madvise_should_skip(unsigned long start, size_t len_in,
+ int behavior, int *err)
+{
+ if (!is_valid_madvise(start, len_in, behavior)) {
+ *err = -EINVAL;
+ return true;
+ }
+ if (start + PAGE_ALIGN(len_in) == start) {
+ *err = 0;
+ return true;
+ }
+ return false;
+}
+
+static bool is_madvise_populate(int behavior)
+{
+ switch (behavior) {
+ case MADV_POPULATE_READ:
+ case MADV_POPULATE_WRITE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int madvise_do_behavior(struct mm_struct *mm,
+ unsigned long start, size_t len_in, int behavior)
+{
+ struct blk_plug plug;
+ unsigned long end;
+ int error;
+
+ if (is_memory_failure(behavior))
+ return madvise_inject_error(behavior, start, start + len_in);
+ start = untagged_addr_remote(mm, start);
+ end = start + PAGE_ALIGN(len_in);
+
+ blk_start_plug(&plug);
+ if (is_madvise_populate(behavior))
+ error = madvise_populate(mm, start, end, behavior);
+ else
+ error = madvise_walk_vmas(mm, start, end, behavior,
+ madvise_vma_behavior);
+ blk_finish_plug(&plug);
+ return error;
+}
+
/*
* The madvise(2) system call.
*
@@ -1639,63 +1772,15 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
*/
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
- unsigned long end;
int error;
- int write;
- size_t len;
- struct blk_plug plug;
-
- if (!madvise_behavior_valid(behavior))
- return -EINVAL;
-
- if (!PAGE_ALIGNED(start))
- return -EINVAL;
- len = PAGE_ALIGN(len_in);
-
- /* Check to see whether len was rounded up from small -ve to zero */
- if (len_in && !len)
- return -EINVAL;
- end = start + len;
- if (end < start)
- return -EINVAL;
-
- if (end == start)
- return 0;
-
-#ifdef CONFIG_MEMORY_FAILURE
- if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
- return madvise_inject_error(behavior, start, start + len_in);
-#endif
-
- write = madvise_need_mmap_write(behavior);
- if (write) {
- if (mmap_write_lock_killable(mm))
- return -EINTR;
- } else {
- mmap_read_lock(mm);
- }
-
- start = untagged_addr_remote(mm, start);
- end = start + len;
-
- blk_start_plug(&plug);
- switch (behavior) {
- case MADV_POPULATE_READ:
- case MADV_POPULATE_WRITE:
- error = madvise_populate(mm, start, end, behavior);
- break;
- default:
- error = madvise_walk_vmas(mm, start, end, behavior,
- madvise_vma_behavior);
- break;
- }
- blk_finish_plug(&plug);
-
- if (write)
- mmap_write_unlock(mm);
- else
- mmap_read_unlock(mm);
+ if (madvise_should_skip(start, len_in, behavior, &error))
+ return error;
+ error = madvise_lock(mm, behavior);
+ if (error)
+ return error;
+ error = madvise_do_behavior(mm, start, len_in, behavior);
+ madvise_unlock(mm, behavior);
return error;
}
@@ -1714,16 +1799,26 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
total_len = iov_iter_count(iter);
+ ret = madvise_lock(mm, behavior);
+ if (ret)
+ return ret;
+
while (iov_iter_count(iter)) {
- ret = do_madvise(mm, (unsigned long)iter_iov_addr(iter),
- iter_iov_len(iter), behavior);
+ unsigned long start = (unsigned long)iter_iov_addr(iter);
+ size_t len_in = iter_iov_len(iter);
+ int error;
+
+ if (madvise_should_skip(start, len_in, behavior, &error))
+ ret = error;
+ else
+ ret = madvise_do_behavior(mm, start, len_in, behavior);
/*
* An madvise operation is attempting to restart the syscall,
* but we cannot proceed as it would not be correct to repeat
* the operation in aggregate, and would be surprising to the
* user.
*
- * As we have already dropped locks, it is safe to just loop and
+ * We drop and reacquire locks so it is safe to just loop and
* try again. We check for fatal signals in case we need exit
* early anyway.
*/
@@ -1732,12 +1827,17 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
ret = -EINTR;
break;
}
+
+ /* Drop and reacquire lock to unwind race. */
+ madvise_unlock(mm, behavior);
+ madvise_lock(mm, behavior);
continue;
}
if (ret < 0)
break;
iov_iter_advance(iter, iter_iov_len(iter));
}
+ madvise_unlock(mm, behavior);
ret = (total_len - iov_iter_count(iter)) ? : ret;