diff options
author | Andi Kleen <ak@suse.de> | 2008-07-23 21:27:47 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-07-24 10:47:18 -0700 |
commit | aa888a74977a8f2120ae9332376e179c39a6b07d (patch) | |
tree | 1834f8a81e0126ffdd9d9622a9522331dffa2ac8 | |
parent | 01ad1c0827db5b3695c53e296dbb2c1da16a0911 (diff) | |
download | lwn-aa888a74977a8f2120ae9332376e179c39a6b07d.tar.gz lwn-aa888a74977a8f2120ae9332376e179c39a6b07d.zip |
hugetlb: support larger than MAX_ORDER
This is needed on x86-64 to handle GB pages in hugetlbfs, because it is
not practical to enlarge MAX_ORDER to 1GB.
Instead, the 1GB pages are only allocated at boot, by the bootmem
allocator, via the hugepages=... option.
These 1GB bootmem pages are never freed. In theory it would be possible to
implement that with some complications, but since it would be a one-way
street (>= MAX_ORDER pages cannot be allocated later) I decided not to
implement it for now.
The >= MAX_ORDER code is not ifdef'ed per architecture. It is not very
big and the ifdef ugliness did not seem to be worth it.
Known problems: /proc/meminfo and "free" do not display the memory
allocated for GB pages in "Total". This is a little confusing for the
user.
Acked-by: Andrew Hastings <abh@cray.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | mm/hugetlb.c | 83 |
1 files changed, 81 insertions, 2 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5e620e25cf08..1a6fe87555b2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -14,6 +14,7 @@ #include <linux/mempolicy.h> #include <linux/cpuset.h> #include <linux/mutex.h> +#include <linux/bootmem.h> #include <linux/sysfs.h> #include <asm/page.h> @@ -489,7 +490,7 @@ static void free_huge_page(struct page *page) INIT_LIST_HEAD(&page->lru); spin_lock(&hugetlb_lock); - if (h->surplus_huge_pages_node[nid]) { + if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { update_and_free_page(h, page); h->surplus_huge_pages--; h->surplus_huge_pages_node[nid]--; @@ -550,6 +551,9 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; + if (h->order >= MAX_ORDER) + return NULL; + page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, @@ -616,6 +620,9 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, struct page *page; unsigned int nid; + if (h->order >= MAX_ORDER) + return NULL; + /* * Assume we will successfully allocate the surplus page to * prevent racing processes from causing the surplus to exceed @@ -792,6 +799,10 @@ static void return_unused_surplus_pages(struct hstate *h, /* Uncommit the reservation */ h->resv_huge_pages -= unused_resv_pages; + /* Cannot return gigantic pages currently */ + if (h->order >= MAX_ORDER) + return; + nr_pages = min(unused_resv_pages, h->surplus_huge_pages); while (remaining_iterations-- && nr_pages) { @@ -913,6 +924,63 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return page; } +static __initdata LIST_HEAD(huge_boot_pages); + +struct huge_bootmem_page { + struct list_head list; + struct hstate *hstate; +}; + +static int __init alloc_bootmem_huge_page(struct hstate *h) +{ + struct huge_bootmem_page *m; + int nr_nodes = nodes_weight(node_online_map); + + while (nr_nodes) { + void *addr; + + addr = __alloc_bootmem_node_nopanic( + NODE_DATA(h->hugetlb_next_nid), + 
huge_page_size(h), huge_page_size(h), 0); + + if (addr) { + /* + * Use the beginning of the huge page to store the + * huge_bootmem_page struct (until gather_bootmem + * puts them into the mem_map). + */ + m = addr; + if (m) + goto found; + } + hstate_next_node(h); + nr_nodes--; + } + return 0; + +found: + BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1)); + /* Put them into a private list first because mem_map is not up yet */ + list_add(&m->list, &huge_boot_pages); + m->hstate = h; + return 1; +} + +/* Put bootmem huge pages into the standard lists after mem_map is up */ +static void __init gather_bootmem_prealloc(void) +{ + struct huge_bootmem_page *m; + + list_for_each_entry(m, &huge_boot_pages, list) { + struct page *page = virt_to_page(m); + struct hstate *h = m->hstate; + __ClearPageReserved(page); + WARN_ON(page_count(page) != 1); + prep_compound_page(page, h->order); + prep_new_huge_page(h, page, page_to_nid(page)); + } +} + static void __init hugetlb_init_one_hstate(struct hstate *h) { unsigned long i; @@ -923,7 +991,10 @@ static void __init hugetlb_init_one_hstate(struct hstate *h) h->hugetlb_next_nid = first_node(node_online_map); for (i = 0; i < h->max_huge_pages; ++i) { - if (!alloc_fresh_huge_page(h)) + if (h->order >= MAX_ORDER) { + if (!alloc_bootmem_huge_page(h)) + break; + } else if (!alloc_fresh_huge_page(h)) break; } h->max_huge_pages = h->free_huge_pages = h->nr_huge_pages = i; @@ -956,6 +1027,9 @@ static void try_to_free_low(struct hstate *h, unsigned long count) { int i; + if (h->order >= MAX_ORDER) + return; + for (i = 0; i < MAX_NUMNODES; ++i) { struct page *page, *next; struct list_head *freel = &h->hugepage_freelists[i]; @@ -982,6 +1056,9 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) { unsigned long min_count, ret; + if (h->order >= MAX_ORDER) + return h->max_huge_pages; + /* * Increase the pool size * First take pages out of surplus state. 
Then make up the @@ -1210,6 +1287,8 @@ static int __init hugetlb_init(void) hugetlb_init_hstates(); + gather_bootmem_prealloc(); + report_hugepages(); hugetlb_sysfs_init(); |