summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/core-api/memory-hotplug.rst83
-rw-r--r--drivers/base/node.c21
-rw-r--r--include/linux/node.h40
-rw-r--r--mm/memory_hotplug.c146
4 files changed, 210 insertions, 80 deletions
diff --git a/Documentation/core-api/memory-hotplug.rst b/Documentation/core-api/memory-hotplug.rst
index d1b8eb9add8a..fb84e78968b2 100644
--- a/Documentation/core-api/memory-hotplug.rst
+++ b/Documentation/core-api/memory-hotplug.rst
@@ -9,6 +9,9 @@ Memory hotplug event notifier
Hotplugging events are sent to a notification queue.
+Memory notifier
+----------------
+
There are six types of notification defined in ``include/linux/memory.h``:
MEM_GOING_ONLINE
@@ -68,6 +71,14 @@ The third argument (arg) passes a pointer of struct memory_notify::
If status_changed_nid* >= 0, callback should create/discard structures for the
node if necessary.
+It is possible to get notified for MEM_CANCEL_ONLINE without having been notified
+for MEM_GOING_ONLINE, and the same applies to MEM_CANCEL_OFFLINE and
+MEM_GOING_OFFLINE.
+This can happen when a consumer fails, meaning we break the callchain and we
+stop calling the remaining consumers of the notifier.
+It is then important that users of memory_notify make no assumptions and get
+prepared to handle such cases.
+
The callback routine shall return one of the values
NOTIFY_DONE, NOTIFY_OK, NOTIFY_BAD, NOTIFY_STOP
defined in ``include/linux/notifier.h``
@@ -80,6 +91,78 @@ further processing of the notification queue.
NOTIFY_STOP stops further processing of the notification queue.
+Numa node notifier
+------------------
+
+There are six types of notification defined in ``include/linux/node.h``:
+
+NODE_ADDING_FIRST_MEMORY
+ Generated before memory becomes available to this node for the first time.
+
+NODE_CANCEL_ADDING_FIRST_MEMORY
+ Generated if NODE_ADDING_FIRST_MEMORY fails.
+
+NODE_ADDED_FIRST_MEMORY
+ Generated when memory has become available fo this node for the first time.
+
+NODE_REMOVING_LAST_MEMORY
+ Generated when the last memory available to this node is about to be offlined.
+
+NODE_CANCEL_REMOVING_LAST_MEMORY
+ Generated when NODE_CANCEL_REMOVING_LAST_MEMORY fails.
+
+NODE_REMOVED_LAST_MEMORY
+ Generated when the last memory available to this node has been offlined.
+
+A callback routine can be registered by calling::
+
+ hotplug_node_notifier(callback_func, priority)
+
+Callback functions with higher values of priority are called before callback
+functions with lower values.
+
+A callback function must have the following prototype::
+
+ int callback_func(
+
+ struct notifier_block *self, unsigned long action, void *arg);
+
+The first argument of the callback function (self) is a pointer to the block
+of the notifier chain that points to the callback function itself.
+The second argument (action) is one of the event types described above.
+The third argument (arg) passes a pointer of struct node_notify::
+
+ struct node_notify {
+ int nid;
+ }
+
+- nid is the node we are adding or removing memory to.
+
+It is possible to get notified for NODE_CANCEL_ADDING_FIRST_MEMORY without
+having been notified for NODE_ADDING_FIRST_MEMORY, and the same applies to
+NODE_CANCEL_REMOVING_LAST_MEMORY and NODE_REMOVING_LAST_MEMORY.
+This can happen when a consumer fails, meaning we break the callchain and we
+stop calling the remaining consumers of the notifier.
+It is then important that users of node_notify make no assumptions and get
+prepared to handle such cases.
+
+The callback routine shall return one of the values
+NOTIFY_DONE, NOTIFY_OK, NOTIFY_BAD, NOTIFY_STOP
+defined in ``include/linux/notifier.h``
+
+NOTIFY_DONE and NOTIFY_OK have no effect on the further processing.
+
+NOTIFY_BAD is used as response to the NODE_ADDING_FIRST_MEMORY,
+NODE_REMOVING_LAST_MEMORY, NODE_ADDED_FIRST_MEMORY or
+NODE_REMOVED_LAST_MEMORY action to cancel hotplugging.
+It stops further processing of the notification queue.
+
+NOTIFY_STOP stops further processing of the notification queue.
+
+Please note that we should not fail for NODE_ADDED_FIRST_MEMORY /
+NODE_REMOVED_FIRST_MEMORY, as memory_hotplug code cannot rollback at that
+point anymore.
+
Locking Internals
=================
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 6cbeca45c451..6d66382dae65 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -112,6 +112,27 @@ static const struct attribute_group *node_access_node_groups[] = {
NULL,
};
+#ifdef CONFIG_MEMORY_HOTPLUG
+static BLOCKING_NOTIFIER_HEAD(node_chain);
+
+int register_node_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&node_chain, nb);
+}
+EXPORT_SYMBOL(register_node_notifier);
+
+void unregister_node_notifier(struct notifier_block *nb)
+{
+ blocking_notifier_chain_unregister(&node_chain, nb);
+}
+EXPORT_SYMBOL(unregister_node_notifier);
+
+int node_notify(unsigned long val, void *v)
+{
+ return blocking_notifier_call_chain(&node_chain, val, v);
+}
+#endif
+
static void node_remove_accesses(struct node *node)
{
struct node_access_nodes *c, *cnext;
diff --git a/include/linux/node.h b/include/linux/node.h
index 88bceebcbfa5..2c7529335b21 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -125,6 +125,46 @@ static inline void register_memory_blocks_under_nodes(void)
#endif
extern void unregister_node(struct node *node);
+
+struct node_notify {
+ int nid;
+};
+
+#define NODE_ADDING_FIRST_MEMORY (1<<0)
+#define NODE_ADDED_FIRST_MEMORY (1<<1)
+#define NODE_CANCEL_ADDING_FIRST_MEMORY (1<<2)
+#define NODE_REMOVING_LAST_MEMORY (1<<3)
+#define NODE_REMOVED_LAST_MEMORY (1<<4)
+#define NODE_CANCEL_REMOVING_LAST_MEMORY (1<<5)
+
+#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_NUMA)
+extern int register_node_notifier(struct notifier_block *nb);
+extern void unregister_node_notifier(struct notifier_block *nb);
+extern int node_notify(unsigned long val, void *v);
+
+#define hotplug_node_notifier(fn, pri) ({ \
+ static __meminitdata struct notifier_block fn##_node_nb =\
+ { .notifier_call = fn, .priority = pri };\
+ register_node_notifier(&fn##_node_nb); \
+})
+#else
+static inline int register_node_notifier(struct notifier_block *nb)
+{
+ return 0;
+}
+static inline void unregister_node_notifier(struct notifier_block *nb)
+{
+}
+static inline int node_notify(unsigned long val, void *v)
+{
+ return 0;
+}
+static inline int hotplug_node_notifier(notifier_fn_t fn, int pri)
+{
+ return 0;
+}
+#endif
+
#ifdef CONFIG_NUMA
extern void node_dev_init(void);
/* Core of the node registration - only memory hotplug should use this */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index d11278c3840d..0d54a01985ba 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -35,6 +35,7 @@
#include <linux/compaction.h>
#include <linux/rmap.h>
#include <linux/module.h>
+#include <linux/node.h>
#include <asm/tlbflush.h>
@@ -699,24 +700,6 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
online_mem_sections(start_pfn, end_pfn);
}
-/* check which state of node_states will be changed when online memory */
-static void node_states_check_changes_online(unsigned long nr_pages,
- struct zone *zone, struct memory_notify *arg)
-{
- int nid = zone_to_nid(zone);
-
- arg->status_change_nid = NUMA_NO_NODE;
-
- if (!node_state(nid, N_MEMORY))
- arg->status_change_nid = nid;
-}
-
-static void node_states_set_node(int node, struct memory_notify *arg)
-{
- if (arg->status_change_nid >= 0)
- node_set_state(node, N_MEMORY);
-}
-
static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
unsigned long nr_pages)
{
@@ -1167,11 +1150,18 @@ void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages)
int online_pages(unsigned long pfn, unsigned long nr_pages,
struct zone *zone, struct memory_group *group)
{
- unsigned long flags;
- int need_zonelists_rebuild = 0;
+ struct memory_notify mem_arg = {
+ .start_pfn = pfn,
+ .nr_pages = nr_pages,
+ .status_change_nid = NUMA_NO_NODE,
+ };
+ struct node_notify node_arg = {
+ .nid = NUMA_NO_NODE,
+ };
const int nid = zone_to_nid(zone);
+ int need_zonelists_rebuild = 0;
+ unsigned long flags;
int ret;
- struct memory_notify arg;
/*
* {on,off}lining is constrained to full memory sections (or more
@@ -1188,11 +1178,17 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
/* associate pfn range with the zone */
move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
- arg.start_pfn = pfn;
- arg.nr_pages = nr_pages;
- node_states_check_changes_online(nr_pages, zone, &arg);
+ if (!node_state(nid, N_MEMORY)) {
+ /* Adding memory to the node for the first time */
+ node_arg.nid = nid;
+ mem_arg.status_change_nid = nid;
+ ret = node_notify(NODE_ADDING_FIRST_MEMORY, &node_arg);
+ ret = notifier_to_errno(ret);
+ if (ret)
+ goto failed_addition;
+ }
- ret = memory_notify(MEM_GOING_ONLINE, &arg);
+ ret = memory_notify(MEM_GOING_ONLINE, &mem_arg);
ret = notifier_to_errno(ret);
if (ret)
goto failed_addition;
@@ -1218,7 +1214,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
online_pages_range(pfn, nr_pages);
adjust_present_page_count(pfn_to_page(pfn), group, nr_pages);
- node_states_set_node(nid, &arg);
+ if (node_arg.nid >= 0)
+ node_set_state(nid, N_MEMORY);
if (need_zonelists_rebuild)
build_all_zonelists(NULL);
@@ -1239,16 +1236,22 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
kswapd_run(nid);
kcompactd_run(nid);
+ if (node_arg.nid >= 0)
+ /* First memory added successfully. Notify consumers. */
+ node_notify(NODE_ADDED_FIRST_MEMORY, &node_arg);
+
writeback_set_ratelimit();
- memory_notify(MEM_ONLINE, &arg);
+ memory_notify(MEM_ONLINE, &mem_arg);
return 0;
failed_addition:
pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
(unsigned long long) pfn << PAGE_SHIFT,
(((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
- memory_notify(MEM_CANCEL_ONLINE, &arg);
+ memory_notify(MEM_CANCEL_ONLINE, &mem_arg);
+ if (node_arg.nid != NUMA_NO_NODE)
+ node_notify(NODE_CANCEL_ADDING_FIRST_MEMORY, &node_arg);
remove_pfn_range_from_zone(zone, pfn, nr_pages);
return ret;
}
@@ -1879,48 +1882,6 @@ static int __init cmdline_parse_movable_node(char *p)
}
early_param("movable_node", cmdline_parse_movable_node);
-/* check which state of node_states will be changed when offline memory */
-static void node_states_check_changes_offline(unsigned long nr_pages,
- struct zone *zone, struct memory_notify *arg)
-{
- struct pglist_data *pgdat = zone->zone_pgdat;
- unsigned long present_pages = 0;
- enum zone_type zt;
-
- arg->status_change_nid = NUMA_NO_NODE;
-
- /*
- * Check whether node_states[N_NORMAL_MEMORY] will be changed.
- * If the memory to be offline is within the range
- * [0..ZONE_NORMAL], and it is the last present memory there,
- * the zones in that range will become empty after the offlining,
- * thus we can determine that we need to clear the node from
- * node_states[N_NORMAL_MEMORY].
- */
- for (zt = 0; zt <= ZONE_NORMAL; zt++)
- present_pages += pgdat->node_zones[zt].present_pages;
-
- /*
- * We have accounted the pages from [0..ZONE_NORMAL); ZONE_HIGHMEM
- * does not apply as we don't support 32bit.
- * Here we count the possible pages from ZONE_MOVABLE.
- * If after having accounted all the pages, we see that the nr_pages
- * to be offlined is over or equal to the accounted pages,
- * we know that the node will become empty, and so, we can clear
- * it for N_MEMORY as well.
- */
- present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages;
-
- if (nr_pages >= present_pages)
- arg->status_change_nid = zone_to_nid(zone);
-}
-
-static void node_states_clear_node(int node, struct memory_notify *arg)
-{
- if (arg->status_change_nid >= 0)
- node_clear_state(node, N_MEMORY);
-}
-
static int count_system_ram_pages_cb(unsigned long start_pfn,
unsigned long nr_pages, void *data)
{
@@ -1936,11 +1897,19 @@ static int count_system_ram_pages_cb(unsigned long start_pfn,
int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
struct zone *zone, struct memory_group *group)
{
- const unsigned long end_pfn = start_pfn + nr_pages;
unsigned long pfn, managed_pages, system_ram_pages = 0;
+ const unsigned long end_pfn = start_pfn + nr_pages;
+ struct pglist_data *pgdat = zone->zone_pgdat;
const int node = zone_to_nid(zone);
+ struct memory_notify mem_arg = {
+ .start_pfn = start_pfn,
+ .nr_pages = nr_pages,
+ .status_change_nid = NUMA_NO_NODE,
+ };
+ struct node_notify node_arg = {
+ .nid = NUMA_NO_NODE,
+ };
unsigned long flags;
- struct memory_notify arg;
char *reason;
int ret;
@@ -1999,11 +1968,23 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
goto failed_removal_pcplists_disabled;
}
- arg.start_pfn = start_pfn;
- arg.nr_pages = nr_pages;
- node_states_check_changes_offline(nr_pages, zone, &arg);
+ /*
+ * Check whether the node will have no present pages after we offline
+ * 'nr_pages' more. If so, we know that the node will become empty, and
+ * so we will clear N_MEMORY for it.
+ */
+ if (nr_pages >= pgdat->node_present_pages) {
+ node_arg.nid = node;
+ mem_arg.status_change_nid = node;
+ ret = node_notify(NODE_REMOVING_LAST_MEMORY, &node_arg);
+ ret = notifier_to_errno(ret);
+ if (ret) {
+ reason = "node notifier failure";
+ goto failed_removal_isolated;
+ }
+ }
- ret = memory_notify(MEM_GOING_OFFLINE, &arg);
+ ret = memory_notify(MEM_GOING_OFFLINE, &mem_arg);
ret = notifier_to_errno(ret);
if (ret) {
reason = "notifier failure";
@@ -2083,27 +2064,32 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
* Make sure to mark the node as memory-less before rebuilding the zone
* list. Otherwise this node would still appear in the fallback lists.
*/
- node_states_clear_node(node, &arg);
+ if (node_arg.nid >= 0)
+ node_clear_state(node, N_MEMORY);
if (!populated_zone(zone)) {
zone_pcp_reset(zone);
build_all_zonelists(NULL);
}
- if (arg.status_change_nid >= 0) {
+ if (node_arg.nid >= 0) {
kcompactd_stop(node);
kswapd_stop(node);
+ /* Node went memoryless. Notify consumers */
+ node_notify(NODE_REMOVED_LAST_MEMORY, &node_arg);
}
writeback_set_ratelimit();
- memory_notify(MEM_OFFLINE, &arg);
+ memory_notify(MEM_OFFLINE, &mem_arg);
remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
return 0;
failed_removal_isolated:
/* pushback to free area */
undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
- memory_notify(MEM_CANCEL_OFFLINE, &arg);
+ memory_notify(MEM_CANCEL_OFFLINE, &mem_arg);
+ if (node_arg.nid != NUMA_NO_NODE)
+ node_notify(NODE_CANCEL_REMOVING_LAST_MEMORY, &node_arg);
failed_removal_pcplists_disabled:
lru_cache_enable();
zone_pcp_enable(zone);