mm, pcp: reduce detecting time of consecutive high order page freeing

In current PCP auto-tuning design, if the number of pages allocated is much more than that of pages freed on a CPU, the PCP high may become the maximal value even if the allocating/freeing depth is small, for example, in the sender of network workloads. If a CPU was used as sender originally, then it is used as receiver after context switching, we need to fill the whole PCP with maximal high before triggering PCP draining for consecutive high order freeing. This will hurt the performance of some network workloads. To solve the issue, in this patch, we will track the consecutive page freeing with a counter in stead of relying on PCP draining. So, we can detect consecutive page freeing much earlier. On a 2-socket Intel server with 128 logical CPU, we tested SCTP_STREAM_MANY test case of netperf test suite with 64-pair processes. With the patch, the network bandwidth improves 5.0%. This restores the performance drop caused by PCP auto-tuning. Link: https://lkml.kernel.org/r/20231016053002.756205-10-ying.huang@intel.com Signed-off-by: "Huang, Ying" <ying.huang@intel.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: David Hildenbrand <david@redhat.com> Cc: Johannes Weiner <jweiner@redhat.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Pavel Tatashin <pasha.tatashin@soleen.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Christoph Lameter <cl@linux.com> Cc: Arjan van de Ven <arjan@linux.intel.com> Cc: Sudeep Holla <sudeep.holla@arm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
author: Huang Ying <ying.huang@intel.com> 2023-10-16 13:30:02 +0800
committer: Andrew Morton <akpm@linux-foundation.org> 2023-10-25 16:47:11 -0700
commit: 6ccdcb6d3a741c4e005ca6ffd4a62ddf8b5bead3 (patch)
tree: e19bac244b54e4f19758f68452ba05359dc0348c /mm
parent: 57c0419c5f0ea2ccab8700895c8fac20ba1eb21f (diff)
download: lwn-6ccdcb6d3a741c4e005ca6ffd4a62ddf8b5bead3.tar.gz
lwn-6ccdcb6d3a741c4e005ca6ffd4a62ddf8b5bead3.zip
1 files changed, 15 insertions, 12 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 58ab8389da05..d52718284029 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2369,13 +2369,10 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free
 	max_nr_free = high - batch;
 
 	/*
-	 * Double the number of pages freed each time there is subsequent
-	 * freeing of pages without any allocation.
+	 * Increase the batch number to the number of the consecutive
+	 * freed pages to reduce zone lock contention.
 	 */
-	batch <<= pcp->free_factor;
-	if (batch <= max_nr_free && pcp->free_factor < CONFIG_PCP_BATCH_SCALE_MAX)
-		pcp->free_factor++;
-	batch = clamp(batch, min_nr_free, max_nr_free);
+	batch = clamp_t(int, pcp->free_count, min_nr_free, max_nr_free);
 
 	return batch;
 }
@@ -2403,7 +2400,9 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
 	 * stored on pcp lists
 	 */
 	if (test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) {
-		pcp->high = max(high - (batch << pcp->free_factor), high_min);
+		int free_count = max_t(int, pcp->free_count, batch);
+
+		pcp->high = max(high - free_count, high_min);
 		return min(batch << 2, pcp->high);
 	}
 
@@ -2411,10 +2410,12 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
 		return high;
 
 	if (test_bit(ZONE_BELOW_HIGH, &zone->flags)) {
-		pcp->high = max(high - (batch << pcp->free_factor), high_min);
+		int free_count = max_t(int, pcp->free_count, batch);
+
+		pcp->high = max(high - free_count, high_min);
 		high = max(pcp->count, high_min);
 	} else if (pcp->count >= high) {
-		int need_high = (batch << pcp->free_factor) + batch;
+		int need_high = pcp->free_count + batch;
 
 		/* pcp->high should be large enough to hold batch freed pages */
 		if (pcp->high < need_high)
@@ -2451,7 +2452,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
 	 * stops will be drained from vmstat refresh context.
 	 */
 	if (order && order <= PAGE_ALLOC_COSTLY_ORDER) {
-		free_high = (pcp->free_factor &&
+		free_high = (pcp->free_count >= batch &&
 			     (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) &&
 			     (!(pcp->flags & PCPF_FREE_HIGH_BATCH) ||
 			      pcp->count >= READ_ONCE(batch)));
@@ -2459,6 +2460,8 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
 	} else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) {
 		pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER;
 	}
+	if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX))
+		pcp->free_count += (1 << order);
 	high = nr_pcp_high(pcp, zone, batch, free_high);
 	if (pcp->count >= high) {
 		free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
@@ -2855,7 +2858,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 	 * See nr_pcp_free() where free_factor is increased for subsequent
 	 * frees.
 	 */
-	pcp->free_factor >>= 1;
+	pcp->free_count >>= 1;
 	list = &pcp->lists[order_to_pindex(migratetype, order)];
 	page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
 	pcp_spin_unlock(pcp);
@@ -5488,7 +5491,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
 	pcp->high_min = BOOT_PAGESET_HIGH;
 	pcp->high_max = BOOT_PAGESET_HIGH;
 	pcp->batch = BOOT_PAGESET_BATCH;
-	pcp->free_factor = 0;
+	pcp->free_count = 0;
 }
 
 static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high_min,
author	Huang Ying <ying.huang@intel.com>	2023-10-16 13:30:02 +0800
committer	Andrew Morton <akpm@linux-foundation.org>	2023-10-25 16:47:11 -0700
commit	6ccdcb6d3a741c4e005ca6ffd4a62ddf8b5bead3 (patch)
tree	e19bac244b54e4f19758f68452ba05359dc0348c /mm
parent	57c0419c5f0ea2ccab8700895c8fac20ba1eb21f (diff)
download	lwn-6ccdcb6d3a741c4e005ca6ffd4a62ddf8b5bead3.tar.gz lwn-6ccdcb6d3a741c4e005ca6ffd4a62ddf8b5bead3.zip