summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--mm/hugetlb.c172
1 files changed, 145 insertions, 27 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 72acbb29d2cc..65616941a383 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -43,6 +43,16 @@ static DEFINE_SPINLOCK(hugetlb_lock);
/*
* Region tracking -- allows tracking of reservations and instantiated pages
* across the pages in a mapping.
+ *
+ * The region data structures are protected by a combination of the mmap_sem
+ * and the hugetlb_instantion_mutex. To access or modify a region the caller
+ * must either hold the mmap_sem for write, or the mmap_sem for read and
+ * the hugetlb_instantiation mutex:
+ *
+ * down_write(&mm->mmap_sem);
+ * or
+ * down_read(&mm->mmap_sem);
+ * mutex_lock(&hugetlb_instantiation_mutex);
*/
struct file_region {
struct list_head link;
@@ -165,6 +175,30 @@ static long region_truncate(struct list_head *head, long end)
return chg;
}
+static long region_count(struct list_head *head, long f, long t)
+{
+ struct file_region *rg;
+ long chg = 0;
+
+ /* Locate each segment we overlap with, and count that overlap. */
+ list_for_each_entry(rg, head, link) {
+ int seg_from;
+ int seg_to;
+
+ if (rg->to <= f)
+ continue;
+ if (rg->from >= t)
+ break;
+
+ seg_from = max(rg->from, f);
+ seg_to = min(rg->to, t);
+
+ chg += seg_to - seg_from;
+ }
+
+ return chg;
+}
+
/*
* Convert the address within this vma to the page offset within
* the mapping, in base page units.
@@ -187,9 +221,15 @@ static pgoff_t vma_pagecache_offset(struct vm_area_struct *vma,
(vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
}
-#define HPAGE_RESV_OWNER (1UL << (BITS_PER_LONG - 1))
-#define HPAGE_RESV_UNMAPPED (1UL << (BITS_PER_LONG - 2))
+/*
+ * Flags for MAP_PRIVATE reservations. These are stored in the bottom
+ * bits of the reservation map pointer, which are always clear due to
+ * alignment.
+ */
+#define HPAGE_RESV_OWNER (1UL << 0)
+#define HPAGE_RESV_UNMAPPED (1UL << 1)
#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
+
/*
* These helpers are used to track how many pages are reserved for
* faults in a MAP_PRIVATE mapping. Only the process that called mmap()
@@ -199,6 +239,15 @@ static pgoff_t vma_pagecache_offset(struct vm_area_struct *vma,
* the reserve counters are updated with the hugetlb_lock held. It is safe
* to reset the VMA at fork() time as it is not in use yet and there is no
* chance of the global counters getting corrupted as a result of the values.
+ *
+ * The private mapping reservation is represented in a subtly different
+ * manner to a shared mapping. A shared mapping has a region map associated
+ * with the underlying file, this region map represents the backing file
+ * pages which have ever had a reservation assigned which this persists even
+ * after the page is instantiated. A private mapping has a region map
+ * associated with the original mmap which is attached to all VMAs which
+ * reference it, this region map represents those offsets which have consumed
+ * reservation ie. where pages have been instantiated.
*/
static unsigned long get_vma_private_data(struct vm_area_struct *vma)
{
@@ -211,22 +260,48 @@ static void set_vma_private_data(struct vm_area_struct *vma,
vma->vm_private_data = (void *)value;
}
-static unsigned long vma_resv_huge_pages(struct vm_area_struct *vma)
+struct resv_map {
+ struct kref refs;
+ struct list_head regions;
+};
+
+struct resv_map *resv_map_alloc(void)
+{
+ struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
+ if (!resv_map)
+ return NULL;
+
+ kref_init(&resv_map->refs);
+ INIT_LIST_HEAD(&resv_map->regions);
+
+ return resv_map;
+}
+
+void resv_map_release(struct kref *ref)
+{
+ struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
+
+ /* Clear out any active regions before we release the map. */
+ region_truncate(&resv_map->regions, 0);
+ kfree(resv_map);
+}
+
+static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
{
VM_BUG_ON(!is_vm_hugetlb_page(vma));
if (!(vma->vm_flags & VM_SHARED))
- return get_vma_private_data(vma) & ~HPAGE_RESV_MASK;
+ return (struct resv_map *)(get_vma_private_data(vma) &
+ ~HPAGE_RESV_MASK);
return 0;
}
-static void set_vma_resv_huge_pages(struct vm_area_struct *vma,
- unsigned long reserve)
+static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
{
VM_BUG_ON(!is_vm_hugetlb_page(vma));
VM_BUG_ON(vma->vm_flags & VM_SHARED);
- set_vma_private_data(vma,
- (get_vma_private_data(vma) & HPAGE_RESV_MASK) | reserve);
+ set_vma_private_data(vma, (get_vma_private_data(vma) &
+ HPAGE_RESV_MASK) | (unsigned long)map);
}
static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
@@ -253,19 +328,12 @@ static void decrement_hugepage_resv_vma(struct vm_area_struct *vma)
if (vma->vm_flags & VM_SHARED) {
/* Shared mappings always use reserves */
resv_huge_pages--;
- } else {
+ } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
/*
* Only the process that called mmap() has reserves for
* private mappings.
*/
- if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
- unsigned long flags, reserve;
- resv_huge_pages--;
- flags = (unsigned long)vma->vm_private_data &
- HPAGE_RESV_MASK;
- reserve = (unsigned long)vma->vm_private_data - 1;
- vma->vm_private_data = (void *)(reserve | flags);
- }
+ resv_huge_pages--;
}
}
@@ -282,7 +350,7 @@ static int vma_has_private_reserves(struct vm_area_struct *vma)
{
if (vma->vm_flags & VM_SHARED)
return 0;
- if (!vma_resv_huge_pages(vma))
+ if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER))
return 0;
return 1;
}
@@ -742,12 +810,19 @@ static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr)
return region_chg(&inode->i_mapping->private_list,
idx, idx + 1);
- } else {
- if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER))
- return 1;
- }
+ } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+ return 1;
- return 0;
+ } else {
+ int err;
+ pgoff_t idx = vma_pagecache_offset(vma, addr);
+ struct resv_map *reservations = vma_resv_map(vma);
+
+ err = region_chg(&reservations->regions, idx, idx + 1);
+ if (err < 0)
+ return err;
+ return 0;
+ }
}
static void vma_commit_reservation(struct vm_area_struct *vma,
unsigned long addr)
@@ -758,6 +833,13 @@ static void vma_commit_reservation(struct vm_area_struct *vma,
if (vma->vm_flags & VM_SHARED) {
pgoff_t idx = vma_pagecache_offset(vma, addr);
region_add(&inode->i_mapping->private_list, idx, idx + 1);
+
+ } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+ pgoff_t idx = vma_pagecache_offset(vma, addr);
+ struct resv_map *reservations = vma_resv_map(vma);
+
+ /* Mark this page used in the map. */
+ region_add(&reservations->regions, idx, idx + 1);
}
}
@@ -1047,11 +1129,41 @@ out:
return ret;
}
+static void hugetlb_vm_op_open(struct vm_area_struct *vma)
+{
+ struct resv_map *reservations = vma_resv_map(vma);
+
+ /*
+ * This new VMA should share its siblings reservation map if present.
+ * The VMA will only ever have a valid reservation map pointer where
+ * it is being copied for another still existing VMA. As that VMA
+ * has a reference to the reservation map it cannot dissappear until
+ * after this open call completes. It is therefore safe to take a
+ * new reference here without additional locking.
+ */
+ if (reservations)
+ kref_get(&reservations->refs);
+}
+
static void hugetlb_vm_op_close(struct vm_area_struct *vma)
{
- unsigned long reserve = vma_resv_huge_pages(vma);
- if (reserve)
- hugetlb_acct_memory(-reserve);
+ struct resv_map *reservations = vma_resv_map(vma);
+ unsigned long reserve;
+ unsigned long start;
+ unsigned long end;
+
+ if (reservations) {
+ start = vma_pagecache_offset(vma, vma->vm_start);
+ end = vma_pagecache_offset(vma, vma->vm_end);
+
+ reserve = (end - start) -
+ region_count(&reservations->regions, start, end);
+
+ kref_put(&reservations->refs, resv_map_release);
+
+ if (reserve)
+ hugetlb_acct_memory(-reserve);
+ }
}
/*
@@ -1068,6 +1180,7 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
struct vm_operations_struct hugetlb_vm_ops = {
.fault = hugetlb_vm_op_fault,
+ .open = hugetlb_vm_op_open,
.close = hugetlb_vm_op_close,
};
@@ -1617,8 +1730,13 @@ int hugetlb_reserve_pages(struct inode *inode,
if (!vma || vma->vm_flags & VM_SHARED)
chg = region_chg(&inode->i_mapping->private_list, from, to);
else {
+ struct resv_map *resv_map = resv_map_alloc();
+ if (!resv_map)
+ return -ENOMEM;
+
chg = to - from;
- set_vma_resv_huge_pages(vma, chg);
+
+ set_vma_resv_map(vma, resv_map);
set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
}